/* Use, modification, and distribution are subject to the Boost Software License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at www.boost.org/LICENSE_1.0.txt) */ // Crawler Class // Main class to web-crawl // Calls Parser.parse() on URLS stored in PagesToVisit Objects import java.util.*; import java.sql.*; import java.text.DecimalFormat; import java.io.*; import java.net.URL; class PagesToVisit // {{{ { private Vector links = new Vector(); public String next() { return (String) links.remove( 0 ); } public String peek() { return (String) links.firstElement(); } public void add( String link ) { links.add( link ); } public void addAtStart( String link ) { links.add(0, link); } public boolean isEmpty() { return links.isEmpty(); } public int size() { return links.size(); } public Iterator getIterator() { return links.iterator(); } public String get(int i) { if(i < links.size()) { return (String) links.get(i); } else return null; } public String remove(int i) { if(i < links.size()) { return (String) links.remove(i); } else return null; } } // }}} class URLStream //{{{ { protected String url; protected InputStream stream; protected long fetchTime; public URLStream(String u, InputStream s, long t) { url = u; stream = s; fetchTime = t; } } //}}} class Crawler // {{{ { // Variable Declaration for Crawler {{{ //First off, lets have some debug variables. Basically, set these to true to output a particular thing. private boolean debug_fetch = true; private boolean debug_fetcherror = true; private boolean debug_checksum = true; private boolean debug_parseerror = true; private boolean debug_unknown = true; private boolean debug_inserttimes = true; private boolean debug_indexing = true; private int indexedCount; private int checkCount; private int errorCount; private int fetchErrorCount; private int parseErrorCount; private int noFollowCount; private int SQLErrorCount; private double startTime; private Connection conn; private boolean quiet; private int cacheHits; private int cacheMisses; private int dbLookups; private int lcacheHits; private int lcacheMisses; private int ldbLookups; private boolean dbl; private WordCache wordCache; private LinkCache linkCache; private Quarantine quarantine; private PagesToVisit toDo; private PagesToVisit toDoLater; private Statement stmt; private ResultSet rs; private String tempWord; private HashSet checkHash; private long getUID = 0; private long gU1 = 0; private long gU2 = 0; private long gU3 = 0; private long gU4 = 0; private long gU5 = 0; // constants private static final String filename = "state.dat"; private static final String separator = "9999-9999?9999-9999"; private static final long PARSETIMEOUT = 30000; //}}} // Batch Statements Stuff {{{ private BatchStatement stmtWordInsert; private PreparedStatement stmtWIDQuery; private BatchStatement stmtURLInsert; private PreparedStatement stmtUIDQuery; private BatchStatement stmtWord_to_URL; private BatchStatement stmtLink; private BatchStatement stmtWord_to_Link; private PreparedStatement stmtUrl_to_Text; private void prepareStmts() { try { if(conn != null) { stmtWordInsert = new BatchStatement(conn.createStatement()); stmtWIDQuery = conn.prepareStatement("SELECT * FROM word WHERE word = ? ;"); stmtURLInsert = new BatchStatement(conn.createStatement()); stmtWord_to_URL = new BatchStatement(conn.createStatement()); stmtLink = new BatchStatement(conn.createStatement()); stmtWord_to_Link = new BatchStatement(conn.createStatement()); stmtUrl_to_Text = conn.prepareStatement("Insert into url_to_text (urlid,header,text) VALUES(?, ?, ?);"); } } catch (SQLException sql) { System.err.println(sql); System.exit(1); } } private void commitStmts() throws SQLException { stmtWordInsert.executeBatch(); stmtURLInsert.executeBatch(); stmtWord_to_URL.executeBatch(); stmtLink.executeBatch(); stmtWord_to_Link.executeBatch(); }//}}} class MyShutdown extends Thread //{{{ { public void run() { crawling = false; while(myRetriever != null && (myRetriever.isAlive() || !doneCrawling)) { try { Thread.sleep(100); } catch (InterruptedException ie) {} } if(toDo.size() > 0 || toDoLater.size() > 0 || URLStreamList.size() > 0) { System.out.print("\nSave crawl state? (y/n): "); try { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String response = in.readLine(); if(response.toLowerCase().startsWith("y")) { writeState(); } } catch(IOException ioe) { System.err.println("Error: " + ioe); } } printInfo(); } } //}}} class ShowSystemState extends Thread //{{{ { public void run() { while (crawling) { System.out.println("\n\n#####################################################"); System.out.println("Current System State Info"); System.out.println("#####################################################\n"); System.out.println("\nState time: "+((System.currentTimeMillis()-startTime)/60000)+"\n"); System.out.println("Size of ToDo: "+toDo.size()); System.out.println("Size of ToDoLater: "+toDoLater.size()); System.out.println("Size of URLStreamList: "+URLStreamList.size()+"\n"); System.out.println("WordInsert BatchSize: "+stmtWordInsert.batchSize()); System.out.println("URLInsert BatchSize: "+stmtURLInsert.batchSize()); System.out.println("Word_to_URL BatchSize: "+stmtWord_to_URL.batchSize()); System.out.println("Link BatchSize: "+stmtLink.batchSize()); System.out.println("Word_to_Link BatchSize: "+stmtWord_to_Link.batchSize()+"\n"); System.out.println("Pages Indexed: "+indexedCount); System.out.println("Checksum Collisions: "+checkCount); System.out.println("Errors: "+errorCount); System.out.println("SQL Errors: "+SQLErrorCount); System.out.println("Fetch Errors: "+fetchErrorCount); System.out.println("Parse Errors: "+parseErrorCount); System.out.println("Links not followed: "+noFollowCount+"\n"); System.out.println("Unique Words: "+cacheMisses); System.out.println("Word Cache Hits:"+cacheHits); System.out.println("Word DB Lookups:"+dbLookups+"\n"); System.out.println("Unique URLs: "+lcacheMisses); System.out.println("Link Cache Hits:"+lcacheHits+"\n"); System.out.println("#####################################################\n\n"); try { Thread.sleep(600000); } catch (InterruptedException ie) { } } } } //}}} private ShowSystemState mySystemState; class ParserThread extends Thread //{{{ { private InputStream myStream; private String curUrl; protected Page page; public ParserThread(String curUrl, InputStream stream) { myStream = stream; this.curUrl = curUrl; page = null; } public void run() { page = Parser.parse(curUrl, myStream); } }//}}} // PageRetriever Stuff {{{ private boolean crawling; private boolean doneCrawling; private Vector URLStreamList = new Vector(); // Retrieves pages in the background and adds them to the URLStreamList vector. // Pops urls off the beginning of the toDo vector to decide what to download. class PageRetriever extends Thread { private int MAX_STREAMLIST_SIZE = 500; public void run() { long t1, time; InputStream stream; String url; HashMap hosts = new HashMap(); long delay_ms; String prevhost = ""; String host = ""; while(crawling) { while(((toDo.isEmpty() && toDoLater.isEmpty()) || URLStreamList.size() >= MAX_STREAMLIST_SIZE) && crawling) { try { Thread.sleep(100); } catch(InterruptedException ie) {} } boolean fromLater = false; url = null; if(!toDoLater.isEmpty() && System.currentTimeMillis() - Limiter.LAST_PAUSE_TIME >= 60000) { for(int i=0; i 1) { linkCache.put(s[0], Integer.parseInt(s[1])); } line = in.readLine(); } System.out.println("\tCheckHash..."); // checkHash line = in.readLine(); while(line != null && !line.equals(separator)) { checkHash.add(new Long(line)); line = in.readLine(); } in.close(); } catch (IOException ioe) { System.err.println("Error reading data from " + filename + " :"); System.err.println(ioe); System.exit(1); } catch (SQLException sql) { System.err.println(sql); } } //}}} class Quarantine //{{{ { protected HashMap qHash; public Quarantine() { qHash = new HashMap(100, (float) 1.00); } public boolean isIn(String url) { return qHash.containsKey(url); } public void add(Link newLink) { Vector links; if (!qHash.containsKey(newLink.getURL())) { links = new Vector(); } else { links = (Vector) qHash.get(newLink.getURL()); } links.add(newLink); qHash.put(newLink.getURL(),links); } public void release(String url) throws SQLException { if (qHash.containsKey(url)) { Vector links = (Vector) qHash.get(url); Iterator it = links.iterator(); Link currLink; while(it.hasNext()) { currLink = (Link) it.next(); linkToDB(currLink); } qHash.remove(url); } } public void kill(String url) { if (qHash.containsKey(url)) { qHash.remove(url); } } } //}}} class WordCache //{{{ { private static final int MAX_HASH_SIZE = 250000; private boolean max_size_reached; private int size; protected HashMap wordHash; protected int curWID; private String mru; public WordCache() { wordHash = new HashMap(MAX_HASH_SIZE, (float) 1.00); max_size_reached = false; size = 0; curWID = 1; cacheHits = 0; cacheMisses = 0; dbLookups = 0; } public void setCurWID(int cur) { curWID = cur; } public Iterator getIterator() { return wordHash.keySet().iterator(); } private void replace(String key, int value) { // most recently used replacement algorithm // wordHash.remove(mru); // wordHash.put(key, new Integer(value)); } public void put(String key, int value) throws SQLException { if(max_size_reached) { replace(key, value); } else { Object test = wordHash.put(key, new Integer(value)); size++; if(size >= MAX_HASH_SIZE) { max_size_reached = true; stmtWordInsert.executeBatch(); } } } public int getWID(String key) throws SQLException //{{{ { ResultSet rs = null; int ret; if(wordHash.containsKey(key)) { ret = ((Integer) wordHash.get(key)).intValue(); cacheHits++; } else if(!max_size_reached) { // the word is not in the hash and the hash is not full // therefore, the word is not in the database, so add it curWID++; stmtWordInsert.addBatch("INSERT INTO word (word,wid) VALUES('" + key + "'," + (curWID-1) + ");" ); put(key, curWID-1); ret = curWID - 1; cacheMisses++; } else { // the hash is full, so we need to check the database //rs = stmt.executeQuery("Select * from word where word = '" + key + "';"); stmtWIDQuery.setString(1, "'"+key+"'"); rs = stmtWIDQuery.executeQuery(); rs.last(); dbLookups++; if( rs.getRow() == 0 ) { // the word is not in the database, so insert it stmtWordInsert.addBatch("INSERT INTO word (word,wid) VALUES('"+key+"'," + curWID + ");" ); // add the word and wid to the cache, replacing something put(key, curWID); curWID++; ret = curWID - 1; } else { // the word is in the database, get the WID and add it to the cache int wid = rs.getInt(1); put(key, wid); ret = wid; } } mru = key; return ret; } //}}} } //}}} private class LinkCache //{{{ { private int size; protected HashMap linkHash; protected int curUID; public LinkCache() { linkHash = new HashMap(160000, (float) 1.00); size = 0; curUID = 1; lcacheHits = 0; lcacheMisses = 0; ldbLookups = 0; } public void setCurUID(int cur) { curUID = cur; } public Iterator getIterator() { return linkHash.keySet().iterator(); } private void put(String key, int value) { linkHash.put(key, new Integer(value)); size++; } public boolean isIn( String key ) { return linkHash.containsKey( key ); } public int getUID(String key) throws SQLException //{{{ { long t3; dbl = false; ResultSet rs = null; int ret; t3 = System.currentTimeMillis(); if(linkHash.containsKey(key)) { ret = ((Integer) linkHash.get(key)).intValue(); lcacheHits++; gU1 = System.currentTimeMillis() - t3; } else { t3 = System.currentTimeMillis(); // the url is not in the hash and the hash is not full // therefore, the url is not in the database, so add it stmtURLInsert.addBatch("INSERT INTO url (url,urlid) VALUES('" + key + "'," + curUID + ");" ); put(key, curUID); curUID++; ret = curUID - 1; lcacheMisses++; gU2 = System.currentTimeMillis() - t3; } return ret; } //}}} } //}}} // Database Insertion functions {{{ // wordToDB Stuff {{{ private String [] tagArray = { "b","i","u","h1","h2","h3","h4","h5","h6","img","title","meta" }; private void wordToDB(int wid, int urlid, Integer pos, String bintags) throws SQLException { String tags = ""; int i = 0; for(i=0; i<12; i++) { if( bintags.charAt(i) == '1') break; } if(i<12) { tags = tags.concat(tagArray[i]); for(int j=i;j<12;j++) { if(bintags.charAt(j) == '1') { tags = tags.concat(","+tagArray[j]); } } } stmtWord_to_URL.addBatch("Insert into word_to_url (wid,urlid,pos,tag) VALUES("+wid+","+urlid+","+pos+",\""+tags+"\");" ); } //}}} private void linkToDB(Link curLink) throws SQLException // {{{ { Iterator detailink = curLink.getPositions(); int urlid = linkCache.getUID(curLink.getURL()); int pageid = linkCache.getUID(curLink.getSourceURL()); wordsToLinkTable(curLink, urlid, pageid); while( detailink.hasNext() ) { Integer pos = (Integer) detailink.next(); //System.out.println("INSERT into link: "+urlid+" "+pos+" "+pageid); stmtLink.addBatch("INSERT INTO link (lurl,pos,urlid) VALUES("+urlid+","+pos+","+pageid+");" ); } } // }}} private void wordsToLinkTable(Link l, int urlid, int pageid) throws SQLException //{{{ { Iterator wordit = l.getWords(); while( wordit.hasNext() ) { String curWordString = (String) wordit.next(); Word curWord = (Word) l.wordList.get(curWordString); int wid = wordCache.getWID(curWordString); Iterator detailit = curWord.getDetails(); while( detailit.hasNext() ) { Integer pos = (Integer) detailit.next(); stmtWord_to_Link.addBatch("INSERT INTO word_to_link (lid,pos,urlid,wid) VALUES("+urlid+","+pos+","+pageid+","+wid+");"); wordToDB(wid, pageid, pos, curWord.posToTags(pos)); } } } //}}} //}}} public void crawl( String[] args ) // {{{ { // Variable Declaration for crawl() {{{ doneCrawling = true; String database = new String(); MyShutdown sh = new MyShutdown(); Runtime.getRuntime().addShutdownHook(sh); startTime = System.currentTimeMillis(); quiet = false; if(args.length > 0) { if(args[0].equals("-q")) { quiet = true; String [] temp = new String[args.length - 1]; for(int i=1; i i + 1) { database = args[i+1]; String [] temp = new String[args.length - 2]; for(int j=0; j PARSETIMEOUT) { pthread.interrupt(); System.out.println("Parser Timeout: "+curUrl); break; } try { Thread.sleep(10); } catch (InterruptedException ie) { } } p = pthread.page; parseTime = System.currentTimeMillis() - t1; } else p = null; // }}} // Process the page object {{{ if( p != null && !checkHash.contains(new Long(p.checkSum))) { quarantine.release(p.getURL()); if (debug_indexing) { if(!Limiter.index(curUrl)) { System.out.print("Not indexing: "); } if(!Limiter.follow(curUrl)) { System.out.print("Not following: "); } } t1 = System.currentTimeMillis(); checkHash.add(new Long(p.checkSum)); indexedCount++; if( p.index ) { urlid = linkCache.getUID(p.getURL()); //grab text and header info try { stmtUrl_to_Text.setInt(1, urlid); stmtUrl_to_Text.setString(2,p.getHeader()); stmtUrl_to_Text.setString(3, p.getText()); stmtUrl_to_Text.executeUpdate(); } catch (SQLException exer) { System.err.println("SQLException: " + exer.getMessage() ); System.err.println("SQLState: " + exer.getSQLState() ); System.err.println("VenderError: " + exer.getErrorCode() ); // open a connection to mysql conn = DriverManager.getConnection("jdbc:mysql://violet.mathcs.carleton.edu/" + database,"webcrawler","twlv34-1"); prepareStmts(); stmtUrl_to_Text.setInt(1, urlid); stmtUrl_to_Text.setString(2,p.getHeader()); stmtUrl_to_Text.setString(3, p.getText()); stmtUrl_to_Text.executeUpdate(); } // go through the words on the page Iterator wordit = p.getWords(); while( wordit.hasNext() ) { curWord = p.keyToWord( (String) wordit.next() ); tempWord = curWord.getWord(); wid = wordCache.getWID(tempWord); // get the info for each word-occurence Iterator detailit = curWord.getDetails(); while( detailit.hasNext() ) { // get the position and the tags for // the appearance of the word pos = (Integer) detailit.next(); bintags = curWord.posToTags( pos ); // add the word to the database wordToDB(wid, urlid, pos, bintags); } } } // add the links on the page to the queue and the hash Iterator it = p.getLinks(); while( it.hasNext() ) { String s = (String) it.next(); if (p.follow && (Limiter.index(s) || Limiter.follow(s))) { if( !linkCache.isIn( s ) ) { if(!quarantine.isIn(s)) { toDo.add( s ); } quarantine.add(p.keyToLink(s)); } else { linkToDB(p.keyToLink(s)); } } else { noFollowCount++; } } insertTime = System.currentTimeMillis() - t1; } else { quarantine.kill(curUrl); if(p != null) { if (debug_checksum) System.err.println( " CheckSum Collision: " + curUrl ); checkCount++; } else if(stream != null) { if (debug_parseerror) System.err.println( " Could not parse: " + curUrl ); parseErrorCount++; } else { if (debug_unknown) System.err.println( " Unknown Error: " + curUrl); errorCount++; } } // }}} if (debug_inserttimes) System.out.println("I: "+insertTime+" ms dbl: "+dbl); } catch(BatchUpdateException bue) { // {{{ SQLErrorCount++; System.err.println(bue); System.err.println("SQLException: " + bue.getMessage() ); System.err.println("SQLState: " + bue.getSQLState() ); System.err.println("VenderError: " + bue.getErrorCode() ); System.err.println("curUrl: " + curUrl ); System.err.println("curWord: " + tempWord ); bue.printStackTrace(System.err); } catch (SQLException ex) { SQLErrorCount++; System.err.println("SQLException: " + ex.getMessage() ); System.err.println("SQLState: " + ex.getSQLState() ); System.err.println("VenderError: " + ex.getErrorCode() ); System.err.println("curUrl: " + curUrl ); System.err.println("curWord: " + tempWord ); ex.printStackTrace(System.err); } // }}} } // }}} System.out.println("Indexer Done"); // Start Shutdown sequence after a crawl {{{ crawling = false; mySystemState.interrupt(); if( rs != null ) { try { rs.close(); } catch (SQLException sqlEx) {} rs = null; } if( stmt != null ) { try { stmt.close(); } catch (SQLException sqlEx) {} stmt = null; } try { commitStmts(); } catch(SQLException bue) { System.err.println(bue); } doneCrawling = true; //}}} } // }}} private void printInfo() { //{{{ int hours; int minutes; int seconds; double elapsedTime = System.currentTimeMillis() - startTime; double totalTime = elapsedTime; hours = (int) Math.floor(elapsedTime / 3600000); elapsedTime = elapsedTime - hours * 3600000; minutes = (int) Math.floor(elapsedTime / 60000); elapsedTime = elapsedTime - minutes * 60000; seconds = (int) Math.floor(elapsedTime / 1000); DecimalFormat myFormatter = new DecimalFormat("##"); DecimalFormat myFormatter2 = new DecimalFormat("###"); System.out.println(indexedCount + " pages crawled."); System.out.println(fetchErrorCount + " pages could not be fetched."); System.out.println(checkCount + " pages had checksum collisions."); System.out.println(parseErrorCount + " pages could not be parsed."); System.out.println(errorCount + " pages had unknown errors."); System.out.println(noFollowCount + " links not followed."); if(!toDo.isEmpty()) System.out.println((toDo.size() + toDoLater.size() + URLStreamList.size()) + " links remaining on the queue."); System.out.println(myFormatter2.format(hours) + " h " + myFormatter.format(minutes) + " m " + myFormatter.format(seconds) + " s"); System.out.println("Avg of " + Math.round(totalTime / indexedCount) + " ms per page."); System.out.println("\n" + cacheHits + " cache hits."); System.out.println(cacheMisses + " cache misses."); System.out.println(dbLookups + " db lookups."); System.out.println("\n" + lcacheHits + " link cache hits."); System.out.println(lcacheMisses + " link cache misses."); System.out.println(ldbLookups + " link db lookups."); int total = cacheHits + cacheMisses + dbLookups; System.out.println("Total: " + total); } //}}} public static void main( String[] args ) { Crawler c = new Crawler(); c.crawl(args); } } // }}}