Skip to content

Commit

Permalink
[NUTCH-3103] Fixed custom max intervals for AdaptiveFetchSchedule
Browse files Browse the repository at this point in the history
1) The loop in setHostSpecificIntervals is cleaned up, and a max interval
that is set to default in the config is now handled correctly.
2) The functions getMinInterval and getMaxInterval are renamed to
getCustomMinInterval and getCustomMaxInterval, respectively, and now return
null if no custom interval has been set for the given URL's hostname. When
either of them returns null, the corresponding default value is used to
bound the calculated interval.
3) The custom interval values in the config are now allowed to equal the
default values. For example, if the default min interval is 7200 then in
the config file "0", "default" and "7200" are all valid values for the
custom min interval, and they all have the same result.
4) The config file template is changed to account for these changes.
  • Loading branch information
martin committed Feb 6, 2025
1 parent b52ec90 commit 931ba17
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 96 deletions.
18 changes: 9 additions & 9 deletions conf/adaptive-host-specific-intervals.txt.template
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# This file defines a mapping that associates specific min. and max. refetching time intervals
# to a host, that deviate from the default settings of the AdaptiveFetchSchedule class.
# This file defines a mapping that associates specific min and max refetching intervals
# with a host, that deviate from the default settings of the AdaptiveFetchSchedule class.
#
# Format: <hostname> <min_interval> <max_interval>.
# Format: <hostname> <min_interval> <max_interval>
#
# The two values will be parsed as float and should be STRICTLY between
# The two interval values will be parsed as float and should lie (inclusively) between
# db.fetch.schedule.adaptive.min_interval and db.fetch.schedule.adaptive.max_interval.
#
# To use default values, write "default" or "0".
# The default min. is 60 (1 min) and default max. is 31536000 (1 year).
# To use the default as a value, write either "default" or "0".
# The default min is 60 (1 min), while the default max is 31536000 (1 year).
#
www.apache.org default 1728000
www.example.org 1296000 0
nutch.apache.org 864000 2160000
www.example.com default 1728000
www.apache.org 1296000 0
nutch.apache.org 864000 2160000
223 changes: 136 additions & 87 deletions src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@
* production system.
* </p>
*
* The class also allows specifying custom min. and max. re-fetch intervals per
* hostname, in adaptive-host-specific-intervals.txt. If they are specified,
* the calculated re-fetch interval for a URL matching the hostname is bounded
* by that host-specific range instead of the default range.
*
* @author Andrzej Bialecki
*/
public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
Expand All @@ -89,9 +95,9 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule {

private Configuration conf;

private Map<String,Float> hostSpecificMaxInterval = new HashMap<>();
private Map<String, Float> hostSpecificMaxInterval = new HashMap<>();

private Map<String,Float> hostSpecificMinInterval = new HashMap<>();
private Map<String, Float> hostSpecificMinInterval = new HashMap<>();

@Override
public void setConf(Configuration conf) {
Expand All @@ -109,18 +115,24 @@ public void setConf(Configuration conf) {
"db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
try {
setHostSpecificIntervals("adaptive-host-specific-intervals.txt",
MIN_INTERVAL, MAX_INTERVAL);
} catch (IOException e){
LOG.error("Failed reading the configuration file. ", e);
MIN_INTERVAL, MAX_INTERVAL);
} catch (IOException e) {
LOG.error("Failed reading the configuration file: " + e.toString());
}
}

/**
* Load host-specific min_intervals and max_intervals
* from the configuration file into the HashMaps.
* Load host-specific minimal and maximal refetch intervals from
* the configuration file into the corresponding HashMaps.
*
* @param fileName the name of the configuration file containing
* the specific intervals
* @param defaultMin the value of the default min interval
* @param defaultMax the value of the default max interval
*/
private void setHostSpecificIntervals(String fileName,
float defaultMin, float defaultMax) throws IOException {
float defaultMin, float defaultMax) throws IOException {
// Setup for reading the config file.
Reader configReader = null;
configReader = conf.getConfResourceAsReader(fileName);
if (configReader == null) {
Expand All @@ -129,67 +141,105 @@ private void setHostSpecificIntervals(String fileName,
BufferedReader reader = new BufferedReader(configReader);
String line;
int lineNo = 0;

// Read the file line by line.
while ((line = reader.readLine()) != null) {
lineNo++;
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
line = line.trim();
String[] parts = line.split("\\s+");
if (parts.length == 3) {
// TODO: Maybe add host validation here?
// It might get computationally expensive for large files, though.
String host = parts[0].trim().toLowerCase();
String minInt = parts[1].trim();
String maxInt = parts[2].trim();
if (minInt.equalsIgnoreCase("default")){ minInt = "0"; }
if (maxInt.equalsIgnoreCase("default")){ maxInt = "0"; }
float m,M;
try {
m = Float.parseFloat(minInt);
M = Float.parseFloat(maxInt);

//negative values and mismatched boundaries are ignored
//(default to global settings)
if (m < 0 || M < 0 || m > M){
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
+ " in the config. file: " + line);
} else {

// min. interval should be positive and above the global minimum
if (m > 0 && m > defaultMin){
hostSpecificMinInterval.put(host,m);
LOG.debug("Added custom min. interval " + m + " for host " + host + ".");
} else if (m > 0) {
LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo)
+ " in the config. file: " + line);
}

// max. interval should be positive and below the global maximum
if (M > 0 && M < defaultMax){
hostSpecificMaxInterval.put(host,M);
LOG.debug("Added custom max. interval " + M + " for host " + host + ".");
} else if (M > 0){
LOG.error("Max. interval out of bounds on line " + String.valueOf(lineNo)
+ " in the config. file: " + line);
}

// zero values are ignored (default to global settings)
}
} catch (NumberFormatException e){
LOG.error("No proper fetch intervals given on line " + String.valueOf(lineNo)
+ " in the config. file: " + line, e);
}
} else {
LOG.error("Malformed (domain, min_interval, max_interval) triplet on line "
+ String.valueOf(lineNo) + " of the config. file: " + line);
}

// Skip blank lines and comments.
if (StringUtils.isBlank(line) || line.startsWith("#")) {
continue;
}

// Trim and partition the line.
line = line.trim();
String[] parts = line.split("\\s+");

// There should be three parts.
if (parts.length != 3) {
LOG.error("Malformed (domain, min_interval, max_interval) triplet on line "
+ String.valueOf(lineNo) + " of config. file: `" + line + "`");
continue;
}

// Normalize the parts.
String host = parts[0].trim().toLowerCase();
String minInt = parts[1].trim();
String maxInt = parts[2].trim();

// "0" and "default" both mean `use default interval`; normalize to "0".
if (minInt.equalsIgnoreCase("default")) { minInt = "0"; }
if (maxInt.equalsIgnoreCase("default")) { maxInt = "0"; }

// Convert intervals to float and ignore the line in case of failure.
float m, M;
try {
m = Float.parseFloat(minInt);
M = Float.parseFloat(maxInt);
} catch (NumberFormatException e) {
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
+ " of config. file: `" + line + "`: " + e.toString());
continue;
}

// If both intervals are set to default,
// ignore the line and issue a warning.
if (m == 0 && M == 0) {
LOG.warn("Ignoring default interval values on line " + String.valueOf(lineNo)
+ " of config. file: `" + line + "`");
continue;
}

// Replace the zero with the default value.
if (m == 0) {
m = defaultMin;
} else if (M == 0) {
M = defaultMax;
}

// Intervals cannot be negative and the min cannot be above the max
// (we assume here that the default values satisfy this).
if (m < 0 || M < 0) {
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
+ " of config. file: `" + line
+ "`: intervals cannot be negative");
continue;
}

if (m > M) {
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
+ " of config. file: `" + line
+ "`: min. interval cannot be above max. interval");
continue;
}

// The custom intervals should respect the boundaries of the default values.
if (m < defaultMin) {
LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo)
+ " of config. file: `" + line + "`");
continue;
}

if (M > defaultMax) {
LOG.error("Max. interval out of bounds on line " + String.valueOf(lineNo)
+ " of config. file: `" + line + "`");
continue;
}

// If all is well, store the specific intervals.
hostSpecificMinInterval.put(host, m);
LOG.debug("Added custom min. interval " + m + " for host " + host);

hostSpecificMaxInterval.put(host, M);
LOG.debug("Added custom max. interval " + M + " for host " + host);

}
}

/**
* Strip a URL, leaving only the host name.
* Strip a URL, leaving only the hostname.
*
* @param url url to get hostname for
* @param url the URL for which to get the hostname
* @return hostname
* @throws URISyntaxException if the given string violates RFC 2396
*/
Expand All @@ -200,49 +250,49 @@ public static String getHostName(String url) throws URISyntaxException {
}

/**
* Returns the max_interval for this URL, which might depend on the host.
* Returns the custom max. refetch interval for this URL,
* if specified for the corresponding hostname.
*
* @param url the URL to be scheduled
* @param defaultMaxInterval the value to which to default if max_interval has not been configured for this host
* @return the configured maximum interval or the default interval
* @return the configured max. interval or null
*/
public float getMaxInterval(Text url, float defaultMaxInterval){
public Float getCustomMaxInterval(Text url) {
if (hostSpecificMaxInterval.isEmpty()) {
return defaultMaxInterval;
return null;
}
String host;
try {
host = getHostName(url.toString());
} catch (URISyntaxException e){
return defaultMaxInterval;
return null;
}
if (hostSpecificMaxInterval.containsKey(host)){
return hostSpecificMaxInterval.get(host);
if (!hostSpecificMaxInterval.containsKey(host)) {
return null;
}
return defaultMaxInterval;
return hostSpecificMaxInterval.get(host);
}

/**
* Returns the min_interval for this URL, which might depend on the host.
* Returns the custom min. refetch interval for this URL,
* if specified for the corresponding hostname.
*
* @param url the URL to be scheduled
* @param defaultMinInterval the value to which to default if min_interval has not been configured for this host
* @return the configured minimum interval or the default interval
* @return the configured min. interval or null
*/
public float getMinInterval(Text url, float defaultMinInterval){
public Float getCustomMinInterval(Text url) {
if (hostSpecificMinInterval.isEmpty()) {
return defaultMinInterval;
return null;
}
String host;
try {
host = getHostName(url.toString());
} catch (URISyntaxException e){
return defaultMinInterval;
return null;
}
if (hostSpecificMinInterval.containsKey(host)){
return hostSpecificMinInterval.get(host);
if (!hostSpecificMinInterval.containsKey(host)) {
return null;
}
return defaultMinInterval;
return hostSpecificMinInterval.get(host);
}

@Override
Expand Down Expand Up @@ -285,14 +335,13 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
}

// replace min_interval and max_interval with domain-specific ones,
// if so configured.
float newMaxInterval = getMaxInterval(url, MAX_INTERVAL);
float newMinInterval = getMinInterval(url, MIN_INTERVAL);
if (interval < newMinInterval) {
interval = newMinInterval;
} else if (interval > newMaxInterval) {
interval = newMaxInterval;
// Ensure the interval does not fall outside of bounds
float minInterval = (getCustomMinInterval(url) != null) ? getCustomMinInterval(url) : MIN_INTERVAL;
float maxInterval = (getCustomMaxInterval(url) != null) ? getCustomMaxInterval(url) : MAX_INTERVAL;
if (interval < minInterval) {
interval = minInterval;
} else if (interval > maxInterval) {
interval = maxInterval;
}
}

Expand Down

0 comments on commit 931ba17

Please sign in to comment.