/* Use, modification, and distribution are subject to the Boost Software License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at www.boost.org/LICENSE_1.0.txt) */ /* PageRank - new Test version Revision 2.4 2/17/04 */ import java.util.*; import java.sql.*; import java.io.*; import java.text.DecimalFormat; class PageRankTest { public static int ITER = 50; // 50 sounds like a good amount /* method for resetting all of the ranks back to 1.0 without performing another crawl. Mainly for testing purposes. As a ranking only needs to be run once per crawler, there should be no reason to reset the ranks between crawls */ private static void resetRanks(Connection conn) throws SQLException { ResultSet rs; Statement stmt; stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_UPDATABLE); rs = stmt.executeQuery("Select urlid, rank From url"); while(rs.next()) { rs.updateFloat(2,(float)1.0); rs.updateRow(); } } /* the big ranking algorithm. Rank for a page is calculated through a simple formula, that is rather complex to implement. The formula is: rank(x)= 0.15 + 0.85 * The sum of (rank/number of links) for all pages that link to x. For optimization, I perform all SQL database lookups before performing any calculations. */ private static void pageRank(Connection conn) throws SQLException { ResultSet rs1, rs2; Statement stmt,stmt2; float rank, new_rank; int count, id; float[] ranks = new float[55000]; int[] counts = new int[55000]; int i; float j; Integer value; Iterator it; HashSet links[] = new HashSet[55000]; float [][] changes = new float[55000][50]; // initializing the HashSet array that will contain all of the // pages linked to a page for (i = 1; i < 55000; i++) { links[i] = new HashSet(); } stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_UPDATABLE); stmt2 = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_UPDATABLE); rs1 = stmt.executeQuery("Select lurl, count(lurl) as num from link Group by lurl"); int temp; // initializing array counts[] to the number of links on each page while(rs1.next()) { temp = rs1.getInt(1); counts[temp] = rs1.getInt(2); } /* initializing array ranks[] to the rank of each page. If a page is a dead link or should otherwise be skipped, its rank is set to zero. */ rs1 = stmt.executeQuery("Select urlid From url_to_text"); for(i = 0; i < 55000; i++) { ranks[i] = 0; } while(rs1.next()) { temp = rs1.getInt(1); ranks[temp] = 1; } /* each HashSet i in links contains the ids of the pages that link to i */ for(i = 0; i < 55000; i++) { j = ranks[i]; if(j != 0) { rs2 = stmt.executeQuery("Select lurl From link where urlid = " + i ); while(rs2.next()) { temp = rs2.getInt(1); value = new Integer(temp); links[i].add(value); } System.out.println(i); } } /* actually performing the calculations. Skips any pages where count is zero to avoid division erros, and skips any pages that have no rank, as they are most likely dead links */ for(int k = 1; k <= ITER; k++) { for(i = 0; i < 55000; i++) { j = ranks[i]; if(j != 0) { new_rank = (float)0.15; it = links[i].iterator(); while(it.hasNext()) { value = (Integer)it.next(); id = value.intValue(); count = counts[id]; rank = ranks[id]; if (count != 0) { new_rank += 0.85 * (rank / count); System.out.println(rank + "\t" + count + "\t" + new_rank); } } ranks[i] = new_rank; changes[i][k-1] = new_rank; System.out.println(i); } } System.out.println("End of iteration " + k); } // writing out the new ranks to the database rs1 = stmt.executeQuery("Select urlid,rank From url"); rs1.beforeFirst(); i = 1; while(rs1.next()) { rs1.updateFloat(2,ranks[i]); rs1.updateRow(); i++; } // writing out the changes in rank to a flat file for later review outputChanges(changes); } /* method that writes out the changes to ranks over time to a text file called rankchanges.txt */ public static void outputChanges(float[][] changes) { int i,j; try { FileOutputStream out = new FileOutputStream("rankchanges.txt"); PrintStream p = new PrintStream(out); for(i = 0; i < 55000; i++) { for (j = 0; j < 49; j++) { p.print("" + changes[i][j] + " "); } p.println("" + changes[i][49]); } p.close(); } catch(Exception e) { System.err.println("Well, shit!"); } } /* testing method for printing all of the ranks */ public static void printRanks(Connection conn) throws SQLException { ResultSet rs; Statement stmt; stmt = conn.createStatement(); rs = stmt.executeQuery("Select rank From url"); while(rs.next()) { System.out.println(rs.getFloat(1)); } } public static void main(String args[]) throws SQLException { boolean commandline = false; String database = null; String option = null; if(args.length == 0) { System.out.println("ERROR: Database name must be supplied as a command-line argument."); System.exit(1); } else if(args.length == 1) { database = args[0]; } else if(args.length == 2) { option = args[0]; database = args[1]; } String answer = ""; Connection conn; try { Class.forName("com.mysql.jdbc.Driver").newInstance(); } catch(Exception ex) { System.err.println(ex); System.exit(1); } conn = DriverManager.getConnection("jdbc:mysql://violet.mathcs.carleton.edu/" + database,"webcrawler","twlv34-1"); if(option == null) { System.out.println("Calculate (c), reset (r) or display (d)?"); InputStreamReader is = new InputStreamReader(System.in); BufferedReader br = new BufferedReader(is); try { answer = br.readLine(); } catch(IOException ex) { System.err.println(ex); System.exit(1); } } else { answer = option; } if (answer.equals("r")) { resetRanks(conn); } else if (answer.equals("c")) { pageRank(conn); } else { printRanks(conn); } } }