|
Cobra: Java HTML Parser
The all-Java Cobra HTML Toolkit includes an HTML
DOM parser which can be used independently of the renderer. The
following are some of its features:
- Implementation of W3C HTML DOM Level 2 interfaces.
- It can be used in headless mode.
- It provides incremental notifications of DOM modifications as the document is parsed.
- It provides routines to incrementally modify the DOM, e.g. by
setting the
innerHTML property of an element.
- It is Javascript-aware. DOM modifications that occur during parsing will
be reflected in the resulting DOM. However, Javascript can be disabled.
- It is CSS2-aware.
API Documentation
See the Cobra API Documentation.
Basic Usage
The recommended way to use the Cobra HTML parser is via the DocumentBuilderImpl class,
roughly as follows:
import org.lobobrowser.html.parser.*;
import org.lobobrowser.html.test.*;
import org.lobobrowser.html.*;
import org.w3c.dom.*;
...
// SimpleUserAgentContext comes from the org.lobobrowser.html.test package imported above.
UserAgentContext context = new SimpleUserAgentContext();
// The builder is configured with the user agent context only (no renderer context).
DocumentBuilderImpl dbi = new DocumentBuilderImpl(context);
// A document URI and a charset should be provided.
Document document = dbi.parse(new InputSourceImpl(inputStream, documentURI, charset));
The
HtmlParser
class can be used directly as well. In particular, it can be used to parse
an HTML document into a third-party DOM implementation, or to parse HTML
below a particular DOM node (which is how the innerHTML property
is implemented).
import org.lobobrowser.html.parser.*;
import org.lobobrowser.html.test.*;
import org.lobobrowser.html.*;
import org.w3c.dom.*;
import org.w3c.dom.html2.*;
...
// Obtain a new HTMLDocument from the builder; the parser below is bound to it.
UserAgentContext context = new SimpleUserAgentContext();
DocumentBuilderImpl dbi = new DocumentBuilderImpl(context);
HTMLDocument document = (HTMLDocument) dbi.createDocument();
...
// Parse the HTML supplied by myReader below someParentNode
// (the same mechanism the innerHTML property relies on).
HtmlParser parser = new HtmlParser(context, document);
parser.parse(myReader, someParentNode);
Performance Tips
Parser performance is typically affected by loading of remote
scripts and CSS documents. There are generally two ways to
deal with this: (1) Disable Javascript and/or CSS, and
(2) Implement some sort of caching mechanism.
All Cobra requests are processed through
UserAgentContext.createHttpRequest(), so the request
routines can be changed by either implementing the UserAgentContext
and HttpRequest interfaces, or by extending simple
implementations of these interfaces provided with Cobra.
Enabling of Javascript is controlled by the
UserAgentContext.isScriptingEnabled() method, so
it is straightforward to disable Javascript by simply
extending SimpleUserAgentContext.
Disabling remote CSS document loading is not so easy at the moment. It requires
extending the HTMLDocumentImpl class, and overriding
its createElement method such that it provides
a dummy element implementation for LINK elements.
Examples
** Image Test:
import org.lobobrowser.html.*;
import org.lobobrowser.html.test.*;
import org.lobobrowser.html.parser.*;
import org.lobobrowser.html.domimpl.*;
import org.w3c.dom.*;
import org.w3c.dom.html2.*;
import java.net.*;
import java.io.*;
/**
 * Example program: downloads the page at {@link #TEST_URI}, parses it with
 * Cobra's {@code DocumentBuilderImpl} and prints each entry of the resulting
 * document's image collection to standard output.
 */
public class ParseImagesTest {
    private static final String TEST_URI = "http://lobobrowser.org";

    /**
     * Runs the example.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        final UserAgentContext agentContext = new SimpleUserAgentContext();
        final DocumentBuilderImpl documentBuilder = new DocumentBuilderImpl(agentContext);
        final InputStream pageStream = new URL(TEST_URI).openConnection().getInputStream();
        try {
            // The response bytes are always decoded as ISO-8859-1 here.
            final Reader pageReader = new InputStreamReader(pageStream, "ISO-8859-1");
            final InputSourceImpl source = new InputSourceImpl(pageReader, TEST_URI);
            final HTMLDocumentImpl htmlDocument = (HTMLDocumentImpl) documentBuilder.parse(source);
            printImages(htmlDocument.getImages());
        } finally {
            pageStream.close();
        }
    }

    /**
     * Prints one line per item of the given collection, prefixed with its index.
     *
     * @param images the collection returned by the document's {@code getImages()}
     */
    private static void printImages(HTMLCollection images) {
        final int count = images.getLength();
        int index = 0;
        while (index < count) {
            System.out.println("- Image#" + index + ": " + images.item(index));
            index++;
        }
    }
}
** Anchors Test:
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.xml.xpath.*;
import javax.xml.parsers.*;
import java.util.logging.*;
import org.lobobrowser.html.UserAgentContext;
import org.lobobrowser.html.parser.*;
import org.lobobrowser.html.test.SimpleUserAgentContext;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Example program: parses the page at {@link #TEST_URI} with Cobra's
 * {@code HtmlParser} into a standard JAXP {@link Document} and prints the
 * {@code href} attribute of every anchor located through XPath.
 */
public class ParseAnchorsTest {
    private static final String TEST_URI = "http://lobobrowser.org";

    /**
     * Runs the example.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched, parsed or queried
     */
    public static void main(String[] args) throws Exception {
        Logger.getLogger("org.lobobrowser").setLevel(Level.WARNING);
        final UserAgentContext agentContext = new SimpleUserAgentContext();
        // The target is a plain XML DOM document from the JAXP factory,
        // not Cobra's own HTML DOM implementation.
        final DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        final InputStream pageStream = new URL(TEST_URI).openConnection().getInputStream();
        try {
            final Reader pageReader = new InputStreamReader(pageStream, "ISO-8859-1");
            final Document targetDocument = xmlBuilder.newDocument();
            // Cobra's HTML parser populates the JAXP document.
            final HtmlParser htmlParser = new HtmlParser(agentContext, targetDocument);
            htmlParser.parse(pageReader);
            printAnchors(targetDocument);
        } finally {
            pageStream.close();
        }
    }

    /**
     * Locates "a" elements that are descendants of any "html" element
     * and prints the {@code href} attribute of each one.
     *
     * @param document the parsed document to query
     * @throws XPathExpressionException if the XPath expression cannot be evaluated
     */
    private static void printAnchors(Document document) throws XPathExpressionException {
        final XPath xpath = XPathFactory.newInstance().newXPath();
        final NodeList anchors = (NodeList) xpath.evaluate("html//a", document, XPathConstants.NODESET);
        final int count = anchors.getLength();
        int index = 0;
        while (index < count) {
            final Element anchor = (Element) anchors.item(index);
            System.out.println("## Anchor # " + index + ": " + anchor.getAttribute("href"));
            index++;
        }
    }
}
** Scraping Test:
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.logging.*;
import javax.xml.xpath.*;
import org.w3c.dom.NodeList;
import org.w3c.dom.Element;
import org.lobobrowser.html.*;
import org.lobobrowser.html.domimpl.*;
import org.lobobrowser.html.parser.DocumentBuilderImpl;
import org.lobobrowser.html.parser.InputSourceImpl;
import org.lobobrowser.html.test.*;
/**
 * Example program: headless form scraping with Cobra.
 *
 * <p>It loads {@link #START_LOCATION}, fills the search text field with
 * {@link #SEARCH_PHRASE}, submits the search form through a minimal
 * renderer context and prints the links found in the resulting page.
 */
public class ScrapingTest {
    private static final String START_LOCATION = "http://metacrawler.com";
    private static final String SEARCH_PHRASE = "java";
    private static final String TEXT_FIELD_ID = "qkw";
    private static final Logger logger = Logger.getLogger(ScrapingTest.class.getName());

    /**
     * Runs the example.
     *
     * @param args ignored
     * @throws Exception if an XPath expression fails to evaluate, or
     *         {@link IllegalStateException} if an expected document or
     *         element is missing
     */
    public static void main(String[] args) throws Exception {
        UserAgentContext uacontext = new LocalUserAgentContext();
        LocalHtmlRendererContext rcontext = new LocalHtmlRendererContext(uacontext);

        // First, we navigate to the starting location.
        rcontext.navigate(START_LOCATION);

        // Next, we search for a form in the resulting
        // document, specifically one that has
        // class="msSearchbox" (MetaCrawler specific).
        HTMLDocumentImpl startingDoc = rcontext.getCurrentDocument();
        if (startingDoc == null) {
            throw new IllegalStateException("No document available for startup location.");
        }
        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList nodeList = (NodeList) xpath.evaluate("//form[@class='msSearchbox']", startingDoc, XPathConstants.NODESET);
        if (nodeList.getLength() == 0) {
            throw new IllegalStateException("Expected search form not found in the page.");
        }
        HTMLFormElementImpl form = (HTMLFormElementImpl) nodeList.item(0);

        // We now look for the text field where
        // the search phrase goes.
        HTMLInputElementImpl textInput = (HTMLInputElementImpl) startingDoc.getElementById(TEXT_FIELD_ID);
        if (textInput == null) {
            throw new IllegalStateException("Did not find a text field named '" + TEXT_FIELD_ID + "'.");
        }
        textInput.setValue(SEARCH_PHRASE);

        // We submit the form as if the "submit" button
        // had been pressed. We expect to get a new
        // document as a result.
        FormInput submitInput = new FormInput("idxSearch_formSubmit", "Submit");
        form.submit(new FormInput[] { submitInput });
        HTMLDocumentImpl searchResultsDoc = rcontext.getCurrentDocument();
        if (searchResultsDoc == null) {
            throw new IllegalStateException("No document available for search results page.");
        }

        // Finally, we print out the search results.
        NodeList resultList = (NodeList) xpath.evaluate("//a[@class='resultsLink']", searchResultsDoc, XPathConstants.NODESET);
        int length = resultList.getLength();
        System.out.println(length + " results found.");
        for (int i = 0; i < length; i++) {
            Element element = (Element) resultList.item(i);
            System.out.println((i + 1) + ". " + element.getTextContent());
            System.out.println(" [" + element.getAttribute("href") + "]");
        }
    }

    /**
     * User agent context that turns off scripting and external CSS
     * and reports a custom user agent string.
     */
    private static class LocalUserAgentContext extends SimpleUserAgentContext {
        @Override
        public boolean isScriptingEnabled() {
            // We don't need Javascript for this.
            return false;
        }

        @Override
        public boolean isExternalCSSEnabled() {
            // We don't need to load remote CSS documents.
            return false;
        }

        @Override
        public String getUserAgent() {
            return "Mozilla/4.0 (compatible; MSIE 6.0;) Cobra/ScrapingTest Parser Demo";
        }
    }

    /**
     * Minimal, non-GUI renderer context. A renderer context is needed for
     * form submission, but the GUI-based SimpleHtmlRendererContext is not;
     * this implementation loads the target of each submission and keeps the
     * parsed result as the current document.
     */
    private static class LocalHtmlRendererContext extends AbstractHtmlRendererContext {
        private final UserAgentContext uacontext;
        private HTMLDocumentImpl document;

        /**
         * @param uacontext the user agent context used for parsing and for
         *        the User-Agent request header
         */
        public LocalHtmlRendererContext(final UserAgentContext uacontext) {
            this.uacontext = uacontext;
        }

        /**
         * Returns the document produced by the most recent
         * {@link #submitForm} call.
         *
         * @return the parsed document, or {@code null} if nothing has been
         *         loaded yet or the last load failed
         */
        public HTMLDocumentImpl getCurrentDocument() {
            return this.document;
        }

        @Override
        public boolean isImageLoadingDisabled() {
            // We don't need to load images.
            return true;
        }

        @Override
        public UserAgentContext getUserAgentContext() {
            // For consistency, this should return the
            // same UserAgentContext that was used to
            // parse the document.
            return this.uacontext;
        }

        /**
         * Convenience navigation built on {@link #submitForm}: issues a GET
         * without form inputs. A malformed location is logged and otherwise
         * ignored.
         *
         * @param urlOrPath the location to load
         */
        public void navigate(String urlOrPath) {
            try {
                URL url = org.lobobrowser.util.Urls.guessURL(urlOrPath);
                this.submitForm("GET", url, "_this", null, null);
            } catch (java.net.MalformedURLException mfu) {
                logger.log(Level.WARNING, "navigate()", mfu);
            }
        }

        /**
         * Performs the form submission and parses the response into the
         * current document. Any failure is logged and leaves the current
         * document set to {@code null}.
         *
         * @param method the HTTP method; compared case-insensitively with GET and POST
         * @param action the URL to submit to
         * @param target not used by this implementation
         * @param enctype not used by this implementation; POST bodies are always
         *        sent as application/x-www-form-urlencoded
         * @param formInputs the inputs to send, may be {@code null}; only text inputs are sent
         */
        @Override
        public void submitForm(String method, URL action, String target, String enctype, FormInput[] formInputs) {
            try {
                // Locale.ROOT keeps the upper-casing independent of the default
                // locale; the result is also handed to setRequestMethod() below.
                final String actualMethod = method.toUpperCase(java.util.Locale.ROOT);
                final boolean isPost = "POST".equals(actualMethod);

                URL resolvedURL;
                if ("GET".equals(actualMethod) && formInputs != null) {
                    resolvedURL = appendQuery(action, formInputs);
                } else {
                    resolvedURL = action;
                }
                // Using potentially different URL for loading.
                final URL urlForLoading = stripQueryForFile(resolvedURL, action);

                URLConnection connection = urlForLoading.openConnection();
                connection.setRequestProperty("User-Agent", getUserAgentContext().getUserAgent());
                connection.setRequestProperty("Cookie", "");
                if (connection instanceof HttpURLConnection) {
                    HttpURLConnection hc = (HttpURLConnection) connection;
                    hc.setRequestMethod(actualMethod);
                    // Do follow redirects
                    hc.setInstanceFollowRedirects(true);
                }
                if (isPost) {
                    writePostBody(connection, formInputs);
                }

                InputStream in = connection.getInputStream();
                try {
                    InputStream bin = new BufferedInputStream(in, 8192);
                    String actualURI = urlForLoading.toExternalForm();
                    // Note that DocumentBuilderImpl needs to be
                    // constructed by passing both a UserAgentContext
                    // and an HtmlRendererContext in this case, so
                    // that form.submit() can take effect.
                    DocumentBuilderImpl builder = new DocumentBuilderImpl(this.uacontext, this);
                    String charset = org.lobobrowser.util.Urls.getCharset(connection);
                    InputSourceImpl is = new InputSourceImpl(bin, actualURI, charset);
                    this.document = (HTMLDocumentImpl) builder.parse(is);
                } finally {
                    in.close();
                }
            } catch (Exception err) {
                this.document = null;
                logger.log(Level.WARNING, "submitForm()", err);
            }
        }

        /**
         * URL-encodes the text inputs as {@code name=value} pairs joined by
         * {@code &}, using UTF-8. Non-text inputs are skipped.
         *
         * @param formInputs the inputs to encode, may be {@code null}
         * @return the encoded pairs, or an empty string when there is nothing to send
         */
        private static String encodeTextInputs(FormInput[] formInputs) throws java.io.IOException {
            StringBuilder encoded = new StringBuilder();
            if (formInputs == null) {
                return "";
            }
            for (int i = 0; i < formInputs.length; i++) {
                FormInput parameter = formInputs[i];
                if (!parameter.isText()) {
                    continue;
                }
                if (encoded.length() > 0) {
                    encoded.append('&');
                }
                encoded.append(URLEncoder.encode(parameter.getName(), "UTF-8"));
                encoded.append('=');
                encoded.append(URLEncoder.encode(parameter.getTextValue(), "UTF-8"));
            }
            return encoded.toString();
        }

        /**
         * Builds the URL for a GET submission by appending the encoded text
         * inputs to the action URL. When there are no text inputs the action
         * is returned unchanged, so no dangling separator is left on the URL.
         */
        private static URL appendQuery(URL action, FormInput[] formInputs) throws java.io.IOException {
            String query = encodeTextInputs(formInputs);
            if (query.length() == 0) {
                return action;
            }
            StringBuilder newUrl = new StringBuilder(action.toExternalForm());
            newUrl.append(action.getQuery() == null ? '?' : '&');
            newUrl.append(query);
            return new URL(newUrl.toString());
        }

        /**
         * For "file" URLs, rebuilds the URL from the action's host, port,
         * path and ref so that the query is removed; other protocols are
         * returned as is.
         */
        private static URL stripQueryForFile(URL resolvedURL, URL action) {
            if (!resolvedURL.getProtocol().equals("file")) {
                return resolvedURL;
            }
            // Remove query so it works.
            try {
                String ref = action.getRef();
                String refText = ref == null || ref.length() == 0 ? "" : "#" + ref;
                return new URL(resolvedURL.getProtocol(), action.getHost(), action.getPort(), action.getPath() + refText);
            } catch (java.net.MalformedURLException malformed) {
                return action;
            }
        }

        /**
         * Sends the encoded text inputs as the request body of a POST.
         * The output stream is closed once the body has been written.
         */
        private static void writePostBody(URLConnection connection, FormInput[] formInputs) throws java.io.IOException {
            connection.setDoOutput(true);
            byte[] postContent = encodeTextInputs(formInputs).getBytes("UTF-8");
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).setFixedLengthStreamingMode(postContent.length);
            }
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            OutputStream postOut = connection.getOutputStream();
            try {
                postOut.write(postContent);
                postOut.flush();
            } finally {
                postOut.close();
            }
        }
    }
}
See Also
Support The Project
|