package LinkChecker;

import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;

/**
 * This class implements a reusable spider. To use this
 * class you must have a class set up to receive the
 * information found by the spider; that class must
 * implement the ISpiderReportable interface. (A usage
 * sketch appears at the end of this file.) Written by
 * Jeff Heaton, the author of "Programming Spiders,
 * Bots, and Aggregators" (Sybex). Jeff can be contacted
 * through his web site at http://www.jeffheaton.com.
 *
 * @author Jeff Heaton (http://www.jeffheaton.com)
 * @version 1.0
 */
public class Spider {

    /**
     * A collection of URLs that resulted in an error.
     */
    protected Collection workloadError = new ArrayList(3);

    /**
     * A collection of URLs that are waiting to
     * be processed.
     */
    protected Collection workloadWaiting = new ArrayList(3);

    /**
     * A collection of URLs that were processed.
     */
    protected Collection workloadProcessed = new ArrayList(3);

    /**
     * The class that the spider should report its
     * URLs to.
     */
    protected ISpiderReportable report;

    /**
     * A flag that indicates whether this process
     * should be canceled.
     */
    protected boolean cancel = false;

    /**
     * The constructor.
     *
     * @param report A class that implements the ISpiderReportable
     * interface, which will receive the information that the spider finds.
     */
    public Spider(ISpiderReportable report) {
        this.report = report;
    }

    /**
     * Get the URLs that resulted in an error.
     *
     * @return A collection of URLs.
     */
    public Collection getWorkloadError() {
        return workloadError;
    }

    /**
     * Get the URLs that are waiting to be processed.
     * You should add one URL to this collection to
     * begin the spider.
     *
     * @return A collection of URLs.
     */
    public Collection getWorkloadWaiting() {
        return workloadWaiting;
    }

    /**
     * Get the URLs that were processed by this spider.
     *
     * @return A collection of URLs.
     */
    public Collection getWorkloadProcessed() {
        return workloadProcessed;
    }

    /**
     * Clear all of the workloads.
     */
    public void clear() {
        getWorkloadError().clear();
        getWorkloadWaiting().clear();
        getWorkloadProcessed().clear();
    }

    /**
     * Set a flag that will cause the begin
     * method to return before it is done.
     */
    public void cancel() {
        cancel = true;
    }
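    /*
     * The following helper is an illustrative addition, not part of the
     * original source: because begin() blocks until the waiting workload
     * drains, cancel() is only useful when the crawl runs on a separate
     * thread. This is one minimal way to arrange that.
     */

    /**
     * Run begin() on a background thread so that cancel() can be
     * called from the caller's thread.
     *
     * @return The thread running the crawl.
     */
    public Thread beginInBackground() {
        Thread crawler = new Thread(new Runnable() {
            public void run() {
                begin();
            }
        });
        crawler.start();
        return crawler;
    }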
    /**
     * Add a URL for processing.
     *
     * @param url The URL to be added.
     */
    public void addURL(URL url) {
        if ( getWorkloadWaiting().contains(url) )
            return;
        if ( getWorkloadError().contains(url) )
            return;
        if ( getWorkloadProcessed().contains(url) )
            return;
        log("Adding to workload: " + url);
        getWorkloadWaiting().add(url);
    }

    /**
     * Called internally to process a URL.
     *
     * @param url The URL to be processed.
     */
    public void processURL(URL url) {
        try {
            log("Processing: " + url);

            // get the URL's contents; skip anything that is not text
            URLConnection connection = url.openConnection();
            if ( (connection.getContentType() != null)
                 && !connection.getContentType().toLowerCase().startsWith("text/") ) {
                getWorkloadWaiting().remove(url);
                getWorkloadProcessed().add(url);
                log("Not processing because content type is: "
                    + connection.getContentType());
                return;
            }

            // read the URL
            InputStream is = connection.getInputStream();
            Reader r = new InputStreamReader(is);

            // parse the URL
            HTMLEditorKit.Parser parse = new HTMLParse().getParser();
            parse.parse(r, new Parser(url), true);
        } catch ( IOException e ) {
            getWorkloadWaiting().remove(url);
            getWorkloadError().add(url);
            log("Error: " + url);
            report.spiderURLError(url);
            return;
        }

        // mark URL as complete
        getWorkloadWaiting().remove(url);
        getWorkloadProcessed().add(url);
        log("Complete: " + url);
    }

    /**
     * Called to start the spider.
     */
    public void begin() {
        cancel = false;
        while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
            // snapshot the waiting workload, since processURL modifies it
            Object list[] = getWorkloadWaiting().toArray();
            for ( int i = 0; (i < list.length) && !cancel; i++ )
                processURL((URL) list[i]);
        }
    }

    /**
     * Called internally to log information. This basic
     * method just writes the log entry to standard output.
     *
     * @param entry The information to be written to the log.
     */
    public void log(String entry) {
        System.out.println((new Date()) + ":" + entry);
    }

    /**
     * An HTML parser callback used by this class to detect links.
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {

        protected URL base;

        public Parser(URL base) {
            this.base = base;
        }

        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = (String) a.getAttribute(HTML.Attribute.HREF);

            // frames carry their target in SRC rather than HREF
            if ( (href == null) && (t == HTML.Tag.FRAME) )
                href = (String) a.getAttribute(HTML.Attribute.SRC);

            if ( href == null )
                return;

            // strip any anchor fragment from the link
            int i = href.indexOf('#');
            if ( i != -1 )
                href = href.substring(0, i);

            if ( href.toLowerCase().startsWith("mailto:") ) {
                report.spiderFoundEMail(href);
                return;
            }

            handleLink(base, href);
        }

        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos); // handle the same way
        }

        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                if ( report.spiderFoundURL(base, url) )
                    addURL(url);
            } catch ( MalformedURLException e ) {
                log("Found malformed URL: " + str);
            }
        }
    }
}
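
/*
 * A minimal driver sketch, not part of the original source: it assumes
 * ISpiderReportable declares exactly the three callbacks invoked above
 * (spiderFoundURL, spiderURLError, spiderFoundEMail). The class name and
 * seed URL below are illustrative. A caller implements ISpiderReportable,
 * seeds the waiting workload with one URL, and calls begin().
 */
class SpiderExample implements ISpiderReportable {

    public static void main(String[] args) throws MalformedURLException {
        Spider spider = new Spider(new SpiderExample());
        // seed the waiting workload with a starting page, then crawl
        spider.addURL(new URL("http://www.example.com/"));
        spider.begin();
    }

    // return true to tell the spider to queue this URL for crawling;
    // restricting to the base host keeps the crawl from wandering off-site
    public boolean spiderFoundURL(URL base, URL url) {
        return base.getHost().equalsIgnoreCase(url.getHost());
    }

    public void spiderURLError(URL url) {
        System.out.println("Bad link: " + url);
    }

    public void spiderFoundEMail(String email) {
        System.out.println("Found e-mail address: " + email);
    }
}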