-
Notifications
You must be signed in to change notification settings - Fork 194
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ad85347
commit 6a0a877
Showing
44 changed files
with
6,744 additions
and
6,744 deletions.
There are no files selected for viewing
130 changes: 65 additions & 65 deletions
130
trunk/MRDP/src/main/java/mrdp/MRDPMain.java → MRDP/src/main/java/mrdp/MRDPMain.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,65 +1,65 @@ | ||
package mrdp; | ||
|
||
import java.util.Arrays; | ||
|
||
import mrdp.ch1.*; | ||
import mrdp.ch2.*; | ||
import mrdp.ch3.*; | ||
import mrdp.ch4.*; | ||
import mrdp.ch5.*; | ||
import mrdp.ch6.*; | ||
import mrdp.ch7.*; | ||
import mrdp.utils.MRDPUtils; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.conf.Configured; | ||
import org.apache.hadoop.util.Tool; | ||
import org.apache.hadoop.util.ToolRunner; | ||
|
||
@SuppressWarnings("unused") | ||
public class MRDPMain extends Configured implements Tool { | ||
|
||
public static void main(String[] args) throws Exception { | ||
System.exit(ToolRunner.run(new Configuration(), new MRDPMain(), args)); | ||
} | ||
|
||
@Override | ||
public int run(String[] args) throws Exception { | ||
if (args.length > 0) { | ||
String example = args[0]; | ||
String[] otherArgs = Arrays.copyOfRange(args, 1, args.length); | ||
|
||
if (example.equalsIgnoreCase("PartitionPruningOutput")) { | ||
PartitionPruningOutputDriver.main(otherArgs); | ||
} else if (example.equalsIgnoreCase("PartitionPruningInput")) { | ||
PartitionPruningInputDriver.main(otherArgs); | ||
} else if (example.equalsIgnoreCase("RedisInput")) { | ||
RedisInputDriver.main(otherArgs); | ||
} else if (example.equalsIgnoreCase("RedisOutput")) { | ||
RedisOutputDriver.main(otherArgs); | ||
} else { | ||
printHelp(); | ||
return 1; | ||
} | ||
|
||
return 0; | ||
} else { | ||
printHelp(); | ||
return 1; | ||
} | ||
} | ||
|
||
private void printHelp() { | ||
System.out | ||
.println("Usage: hadoop jar mrdp.jar <example> <example args>"); | ||
System.out.println("Examples are:"); | ||
System.out.println("Chapter 7:"); | ||
System.out | ||
.println("\tRedisOutput <user data> <redis hosts> <hashset name>"); | ||
System.out | ||
.println("\tRedisInput <redis hosts> <hashset name> <output>"); | ||
System.out.println("\tPartitionPruningOutput <user data>"); | ||
System.out | ||
.println("\tPartitionPruningInput <last access months> <output>"); | ||
} | ||
} | ||
package mrdp; | ||
|
||
import java.util.Arrays; | ||
|
||
import mrdp.ch1.*; | ||
import mrdp.ch2.*; | ||
import mrdp.ch3.*; | ||
import mrdp.ch4.*; | ||
import mrdp.ch5.*; | ||
import mrdp.ch6.*; | ||
import mrdp.ch7.*; | ||
import mrdp.utils.MRDPUtils; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.conf.Configured; | ||
import org.apache.hadoop.util.Tool; | ||
import org.apache.hadoop.util.ToolRunner; | ||
|
||
@SuppressWarnings("unused") | ||
public class MRDPMain extends Configured implements Tool { | ||
|
||
public static void main(String[] args) throws Exception { | ||
System.exit(ToolRunner.run(new Configuration(), new MRDPMain(), args)); | ||
} | ||
|
||
@Override | ||
public int run(String[] args) throws Exception { | ||
if (args.length > 0) { | ||
String example = args[0]; | ||
String[] otherArgs = Arrays.copyOfRange(args, 1, args.length); | ||
|
||
if (example.equalsIgnoreCase("PartitionPruningOutput")) { | ||
PartitionPruningOutputDriver.main(otherArgs); | ||
} else if (example.equalsIgnoreCase("PartitionPruningInput")) { | ||
PartitionPruningInputDriver.main(otherArgs); | ||
} else if (example.equalsIgnoreCase("RedisInput")) { | ||
RedisInputDriver.main(otherArgs); | ||
} else if (example.equalsIgnoreCase("RedisOutput")) { | ||
RedisOutputDriver.main(otherArgs); | ||
} else { | ||
printHelp(); | ||
return 1; | ||
} | ||
|
||
return 0; | ||
} else { | ||
printHelp(); | ||
return 1; | ||
} | ||
} | ||
|
||
private void printHelp() { | ||
System.out | ||
.println("Usage: hadoop jar mrdp.jar <example> <example args>"); | ||
System.out.println("Examples are:"); | ||
System.out.println("Chapter 7:"); | ||
System.out | ||
.println("\tRedisOutput <user data> <redis hosts> <hashset name>"); | ||
System.out | ||
.println("\tRedisInput <redis hosts> <hashset name> <output>"); | ||
System.out.println("\tPartitionPruningOutput <user data>"); | ||
System.out | ||
.println("\tPartitionPruningInput <last access months> <output>"); | ||
} | ||
} |
194 changes: 97 additions & 97 deletions
194
...ava/mrdp/appendixA/BloomFilterDriver.java → ...ava/mrdp/appendixA/BloomFilterDriver.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,97 +1,97 @@ | ||
package mrdp.appendixA; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.InputStreamReader; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.FSDataOutputStream; | ||
import org.apache.hadoop.fs.FileStatus; | ||
import org.apache.hadoop.fs.FileSystem; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.util.GenericOptionsParser; | ||
import org.apache.hadoop.util.bloom.BloomFilter; | ||
import org.apache.hadoop.util.bloom.Key; | ||
import org.apache.hadoop.util.hash.Hash; | ||
|
||
public class BloomFilterDriver { | ||
|
||
public static void main(String[] args) throws Exception { | ||
Configuration conf = new Configuration(); | ||
String[] otherArgs = new GenericOptionsParser(conf, args) | ||
.getRemainingArgs(); | ||
if (otherArgs.length != 4) { | ||
System.err | ||
.println("Usage: BloomFilterWriter <inputfile> <nummembers> <falseposrate> <bfoutfile>"); | ||
System.exit(1); | ||
} | ||
|
||
FileSystem fs = FileSystem.get(new Configuration()); | ||
|
||
// Parse command line arguments | ||
Path inputFile = new Path(otherArgs[0]); | ||
int numMembers = Integer.parseInt(otherArgs[1]); | ||
float falsePosRate = Float.parseFloat(otherArgs[2]); | ||
Path bfFile = new Path(otherArgs[3]); | ||
|
||
// Calculate our vector size and optimal K value based on approximations | ||
int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate); | ||
int nbHash = getOptimalK(numMembers, vectorSize); | ||
|
||
// create new Bloom filter | ||
BloomFilter filter = new BloomFilter(vectorSize, nbHash, | ||
Hash.MURMUR_HASH); | ||
|
||
// Open file for read | ||
|
||
System.out.println("Training Bloom filter of size " + vectorSize | ||
+ " with " + nbHash + " hash functions, " + numMembers | ||
+ " approximate number of records, and " + falsePosRate | ||
+ " false positive rate"); | ||
|
||
String line = null; | ||
int numRecords = 0; | ||
for (FileStatus status : fs.listStatus(inputFile)) { | ||
BufferedReader rdr; | ||
// if file is gzipped, wrap it in a GZIPInputStream | ||
if (status.getPath().getName().endsWith(".gz")) { | ||
rdr = new BufferedReader(new InputStreamReader( | ||
new GZIPInputStream(fs.open(status.getPath())))); | ||
} else { | ||
rdr = new BufferedReader(new InputStreamReader(fs.open(status | ||
.getPath()))); | ||
} | ||
|
||
System.out.println("Reading " + status.getPath()); | ||
while ((line = rdr.readLine()) != null) { | ||
filter.add(new Key(line.getBytes())); | ||
++numRecords; | ||
} | ||
|
||
rdr.close(); | ||
} | ||
|
||
System.out.println("Trained Bloom filter with " + numRecords | ||
+ " entries."); | ||
|
||
System.out.println("Serializing Bloom filter to HDFS at " + bfFile); | ||
FSDataOutputStream strm = fs.create(bfFile); | ||
filter.write(strm); | ||
|
||
strm.flush(); | ||
strm.close(); | ||
|
||
System.out.println("Done training Bloom filter."); | ||
} | ||
|
||
public static int getOptimalBloomFilterSize(int numRecords, | ||
float falsePosRate) { | ||
int size = (int) (-numRecords * (float) Math.log(falsePosRate) / Math | ||
.pow(Math.log(2), 2)); | ||
return size; | ||
} | ||
|
||
public static int getOptimalK(float numMembers, float vectorSize) { | ||
return (int) Math.round(vectorSize / numMembers * Math.log(2)); | ||
} | ||
} | ||
package mrdp.appendixA; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.InputStreamReader; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.FSDataOutputStream; | ||
import org.apache.hadoop.fs.FileStatus; | ||
import org.apache.hadoop.fs.FileSystem; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.util.GenericOptionsParser; | ||
import org.apache.hadoop.util.bloom.BloomFilter; | ||
import org.apache.hadoop.util.bloom.Key; | ||
import org.apache.hadoop.util.hash.Hash; | ||
|
||
public class BloomFilterDriver { | ||
|
||
public static void main(String[] args) throws Exception { | ||
Configuration conf = new Configuration(); | ||
String[] otherArgs = new GenericOptionsParser(conf, args) | ||
.getRemainingArgs(); | ||
if (otherArgs.length != 4) { | ||
System.err | ||
.println("Usage: BloomFilterWriter <inputfile> <nummembers> <falseposrate> <bfoutfile>"); | ||
System.exit(1); | ||
} | ||
|
||
FileSystem fs = FileSystem.get(new Configuration()); | ||
|
||
// Parse command line arguments | ||
Path inputFile = new Path(otherArgs[0]); | ||
int numMembers = Integer.parseInt(otherArgs[1]); | ||
float falsePosRate = Float.parseFloat(otherArgs[2]); | ||
Path bfFile = new Path(otherArgs[3]); | ||
|
||
// Calculate our vector size and optimal K value based on approximations | ||
int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate); | ||
int nbHash = getOptimalK(numMembers, vectorSize); | ||
|
||
// create new Bloom filter | ||
BloomFilter filter = new BloomFilter(vectorSize, nbHash, | ||
Hash.MURMUR_HASH); | ||
|
||
// Open file for read | ||
|
||
System.out.println("Training Bloom filter of size " + vectorSize | ||
+ " with " + nbHash + " hash functions, " + numMembers | ||
+ " approximate number of records, and " + falsePosRate | ||
+ " false positive rate"); | ||
|
||
String line = null; | ||
int numRecords = 0; | ||
for (FileStatus status : fs.listStatus(inputFile)) { | ||
BufferedReader rdr; | ||
// if file is gzipped, wrap it in a GZIPInputStream | ||
if (status.getPath().getName().endsWith(".gz")) { | ||
rdr = new BufferedReader(new InputStreamReader( | ||
new GZIPInputStream(fs.open(status.getPath())))); | ||
} else { | ||
rdr = new BufferedReader(new InputStreamReader(fs.open(status | ||
.getPath()))); | ||
} | ||
|
||
System.out.println("Reading " + status.getPath()); | ||
while ((line = rdr.readLine()) != null) { | ||
filter.add(new Key(line.getBytes())); | ||
++numRecords; | ||
} | ||
|
||
rdr.close(); | ||
} | ||
|
||
System.out.println("Trained Bloom filter with " + numRecords | ||
+ " entries."); | ||
|
||
System.out.println("Serializing Bloom filter to HDFS at " + bfFile); | ||
FSDataOutputStream strm = fs.create(bfFile); | ||
filter.write(strm); | ||
|
||
strm.flush(); | ||
strm.close(); | ||
|
||
System.out.println("Done training Bloom filter."); | ||
} | ||
|
||
public static int getOptimalBloomFilterSize(int numRecords, | ||
float falsePosRate) { | ||
int size = (int) (-numRecords * (float) Math.log(falsePosRate) / Math | ||
.pow(Math.log(2), 2)); | ||
return size; | ||
} | ||
|
||
public static int getOptimalK(float numMembers, float vectorSize) { | ||
return (int) Math.round(vectorSize / numMembers * Math.log(2)); | ||
} | ||
} |
Oops, something went wrong.