Example 6-15: Using JTidy to turn HTML into XML

import java.io.*;
import java.net.URL;
import oracle.xml.parser.v2.*;
import org.w3c.dom.*;
import org.w3c.tidy.Tidy;

/**
 * Converts a possibly ill-formed HTML page into an Oracle XML DOM document
 * by letting JTidy parse the page and then copying the resulting DOM tree
 * node by node into a fresh {@code XMLDocument}.
 */
public class JTidyConverter {
   /**
    * Fetches the page at the given URL, has JTidy parse it into an in-memory
    * DOM tree, and returns an Oracle {@code XMLDocument} copy of that tree.
    *
    * @param u location of the (possibly ill-formed) HTML page to read
    * @return a new Oracle XML DOM document mirroring the tidied page
    * @throws IOException if the URL's stream cannot be opened or closed
    */
   public XMLDocument XMLifyHTMLFrom(URL u) throws IOException {
      // (1) Construct a new Tidy bean to use for converting HTML to XML
      Tidy tidy = new Tidy();
      // (2) Set some Tidy options to get the best results for "data scraping"
      tidy.setMakeClean(true);
      tidy.setBreakBeforeBR(true);
      tidy.setShowWarnings(false);
      tidy.setOnlyErrors(true);
      // Send Tidy's error output to an in-memory writer that is never read,
      // so diagnostics are effectively discarded.
      tidy.setErrout(new PrintWriter(new StringWriter()));
      // (3) Construct an empty target Oracle XML DOM Document
      XMLDocument xmldocToReturn = new XMLDocument();
      // (4) Get an InputStream of HTML from the URL. Open it exactly once:
      //     a second openStream() call would fetch the page again and leave
      //     the first stream dangling.
      InputStream HTMLInput = u.openStream();
      try {
         // (5) Ask Tidy to parse the incoming HTML into an in-memory DOM tree.
         //     The null second argument means no serialized output is written.
         Document tidiedHTMLDoc = tidy.parseDOM(HTMLInput, null);
         // (6) Clone the JTidy DOM tree by recursively building an Oracle DOM copy
         cloneXMLFragment(tidiedHTMLDoc, xmldocToReturn);
      }
      finally {
         // Always release the connection, even if parsing or cloning fails
         HTMLInput.close();
      }
      return xmldocToReturn;
   }

   /**
    * Recursively builds an Oracle XML Parser DOM tree by walking the JTidy
    * DOM tree of the "tidied" page.
    *
    * Only document, element and text nodes are copied; any other node type
    * is skipped without being added to the target.
    *
    * @param node      current node in the source (JTidy) tree; may be null,
    *                  in which case nothing is done
    * @param curTarget node in the target tree that receives the copy of
    *                  {@code node} as a new child
    */
   private void cloneXMLFragment(Node node, Node curTarget) {
      if ( node == null ) return;
      // New nodes must be created by the document that owns the target. A
      // Document has no owner document of its own, so use it directly.
      Document d = curTarget instanceof Document ? (Document)curTarget :
                                        curTarget.getOwnerDocument();
      int type = node.getNodeType();
      switch ( type ) {
        // If we get the root node of the document, start the recursion
        // by building the document element
        case Node.DOCUMENT_NODE:
           cloneXMLFragment(((Document)node).getDocumentElement(),d);
           break;

        // If we get an Element in the JTidy DOM, create an Element in the
        // Oracle DOM and append it to the current target node as a child.
        // Also copy each attribute of the JTidy DOM Element.
        case Node.ELEMENT_NODE:
           Element e = d.createElement(node.getNodeName());
           NamedNodeMap attrs = node.getAttributes();
           if ( attrs != null ) {
              int attrCount = attrs.getLength();
              for ( int i = 0; i < attrCount; i++ ) {
                 Node attr = attrs.item(i);
                 e.setAttribute(attr.getNodeName(), attr.getNodeValue());
              }
           }
           curTarget.appendChild(e);
           NodeList children = node.getChildNodes();

           // Recurse to build any children under the newly created element
           if ( children != null ) {
              int len = children.getLength();
              for ( int i = 0; i < len; i++ ) {
                 cloneXMLFragment(children.item(i),e);
              }
           }
           break;

        // If we get a Text Node in the JTidy DOM, create a Text Node in the
        // Oracle DOM and append it to the current target node as a child
        case Node.TEXT_NODE:
           curTarget.appendChild(d.createTextNode(node.getNodeValue()));
           break;
      }
   }
}