/* Use, modification, and distribution are subject to the Boost Software License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at www.boost.org/LICENSE_1.0.txt) */

import java.util.*;

/**
 * Static URL and time-of-day policy checks for a crawler restricted to
 * carleton.edu hosts.
 *
 * <p>{@link #index}, {@link #follow} and {@link #parse} are pure regex filters
 * on a URL string. {@link #fetch} and {@link #isEarliestCrawlTime} additionally
 * consult the current wall-clock time in the US Central time zone.
 */
class Limiter {
    /**
     * Wall-clock millis of the most recent refusal by {@link #fetch}, or -1
     * after {@link #fetch} last allowed a carleton.edu host.
     */
    protected static long LAST_PAUSE_TIME;

    /**
     * Hour of day (0-23, US Central) compared against by
     * {@link #isEarliestCrawlTime}. {@link #fetch} lowers it to 17 when it
     * refuses a fetch and the current value is larger.
     */
    public static long EARLIEST_CRAWL_TIME;

    /**
     * Zone used for all hour/day computations. The original code hand-built a
     * -6h zone with April/October DST rules (the pre-2007 US rules), which put
     * the computed hour off by one for several weeks each spring and autumn.
     * The tz-database zone tracks the current rules.
     */
    private static final TimeZone CAMPUS_ZONE = TimeZone.getTimeZone("America/Chicago");

    /** First hour (inclusive) of the weekday no-crawl window. */
    private static final int PEAK_START_HOUR = 8;

    /** Last hour (exclusive) of the weekday no-crawl window. */
    private static final int PEAK_END_HOUR = 17;

    /**
     * Decides whether a URL should be indexed.
     *
     * @param url the URL to test
     * @return {@code true} if the URL contains {@code carleton.edu} anywhere
     */
    public static boolean index(String url) {
        return url.matches(".*carleton\\.edu.*");
    }

    /**
     * Decides whether links to a URL should be followed.
     *
     * @param url the URL to test
     * @return {@code true} if the URL is http(s) with a host ending in
     *         {@code carleton.edu} and is not one of the excluded areas
     *         (library search, webkiosk, academic calendar queries)
     */
    public static boolean follow(String url) {
        if (!url.matches("https?://[^/]*carleton\\.edu.*")) {
            return false;
        }
        if (url.matches(".*muse\\.library\\.carleton\\.edu/search/.*")) {
            return false;
        }
        if (url.matches(".*webkiosk\\.carleton\\.edu.*")) {
            return false;
        }
        return !url.matches("http://webapps.acs.carleton.edu/calendar/academic/\\?.*");
    }

    /**
     * Decides whether the document behind a URL should be parsed.
     *
     * @param url the URL to test
     * @return {@code false} for URLs containing {@code &amp;} or
     *         {@code textonly=1}; otherwise {@code true} for a bare
     *         {@code http(s)://host} URL or one whose path ends in a known
     *         page suffix or {@code /}, optionally followed by one query string
     */
    public static boolean parse(String url) {
        if (url.matches(".*&amp;.*")) {
            return false;
        }
        if (url.matches(".*textonly=1.*")) {
            return false;
        }
        if (url.matches("(?i)\\Ahttps?://[^/]*\\Z")) {
            return true;
        }
        return url.matches("(?i)\\Ahttps?://.*(htm|html|php|php3|php4|php5|pl|asp|/)(\\?[^?]*){0,1}\\Z");
    }

    /**
     * Decides whether a host may be fetched right now.
     *
     * <p>Hosts that do not contain {@code carleton.edu} are always allowed and
     * leave the static state untouched. For carleton.edu hosts, fetching is
     * refused during peak time (see {@link #isPeakTime}); a refusal records
     * the current time in {@link #LAST_PAUSE_TIME} and caps
     * {@link #EARLIEST_CRAWL_TIME} at 17. An allowed fetch resets
     * {@link #LAST_PAUSE_TIME} to -1.
     *
     * @param host the host name to test
     * @return {@code true} if the host may be fetched now
     */
    public static boolean fetch(String host) {
        if (!host.matches(".*carleton\\.edu.*")) {
            return true;
        }

        // Peak times apply to all carleton servers.
        if (isPeakTime(new Date())) {
            LAST_PAUSE_TIME = System.currentTimeMillis();
            if (PEAK_END_HOUR < EARLIEST_CRAWL_TIME) {
                EARLIEST_CRAWL_TIME = PEAK_END_HOUR;
            }
            return false;
        }

        // Maintenance windows (Friday from 21:00; Saturday/Sunday 12:00-17:00)
        // were commented out in the original and remain unenforced.

        LAST_PAUSE_TIME = -1;
        return true;
    }

    /**
     * Reports whether the current US Central hour of day has reached
     * {@link #EARLIEST_CRAWL_TIME}.
     *
     * @return {@code true} if the current hour is greater than or equal to
     *         {@link #EARLIEST_CRAWL_TIME}
     */
    public static boolean isEarliestCrawlTime() {
        int hour = campusCalendar(new Date()).get(Calendar.HOUR_OF_DAY);
        return hour >= EARLIEST_CRAWL_TIME;
    }

    /**
     * Reports whether an instant falls in the weekday no-crawl window:
     * Monday through Friday, 08:00 (inclusive) to 17:00 (exclusive), US
     * Central time. Package-private so the window can be checked against
     * fixed instants.
     *
     * @param when the instant to classify
     * @return {@code true} if {@code when} is inside the peak window
     */
    static boolean isPeakTime(Date when) {
        Calendar local = campusCalendar(when);
        int day = local.get(Calendar.DAY_OF_WEEK);
        int hour = local.get(Calendar.HOUR_OF_DAY);
        boolean weekday = day > Calendar.SUNDAY && day < Calendar.SATURDAY;
        return weekday && hour >= PEAK_START_HOUR && hour < PEAK_END_HOUR;
    }

    /** Builds a fresh calendar in {@link #CAMPUS_ZONE} positioned at {@code when}. */
    private static Calendar campusCalendar(Date when) {
        Calendar local = new GregorianCalendar(CAMPUS_ZONE);
        local.setTime(when);
        return local;
    }
}