Move MRDP folder to root

adamjshook committed Dec 7, 2012
1 parent ad85347 commit 6a0a877
Showing 44 changed files with 6,744 additions and 6,744 deletions.
@@ -1,65 +1,65 @@
package mrdp;

import java.util.Arrays;

import mrdp.ch1.*;
import mrdp.ch2.*;
import mrdp.ch3.*;
import mrdp.ch4.*;
import mrdp.ch5.*;
import mrdp.ch6.*;
import mrdp.ch7.*;
import mrdp.utils.MRDPUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

@SuppressWarnings("unused")
public class MRDPMain extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new MRDPMain(), args));
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length > 0) {
            // The first argument selects the example; the rest are
            // passed through unchanged to that example's driver
            String example = args[0];
            String[] otherArgs = Arrays.copyOfRange(args, 1, args.length);

            if (example.equalsIgnoreCase("PartitionPruningOutput")) {
                PartitionPruningOutputDriver.main(otherArgs);
            } else if (example.equalsIgnoreCase("PartitionPruningInput")) {
                PartitionPruningInputDriver.main(otherArgs);
            } else if (example.equalsIgnoreCase("RedisInput")) {
                RedisInputDriver.main(otherArgs);
            } else if (example.equalsIgnoreCase("RedisOutput")) {
                RedisOutputDriver.main(otherArgs);
            } else {
                printHelp();
                return 1;
            }

            return 0;
        } else {
            printHelp();
            return 1;
        }
    }

    private void printHelp() {
        System.out.println("Usage: hadoop jar mrdp.jar <example> <example args>");
        System.out.println("Examples are:");
        System.out.println("Chapter 7:");
        System.out.println("\tRedisOutput <user data> <redis hosts> <hashset name>");
        System.out.println("\tRedisInput <redis hosts> <hashset name> <output>");
        System.out.println("\tPartitionPruningOutput <user data>");
        System.out.println("\tPartitionPruningInput <last access months> <output>");
    }
}
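MRDPMain is the jar's entry point: the first argument names one of the book's examples, and the remaining arguments are handed to that example's driver unchanged. Following the usage text printed by printHelp, an invocation might look like the line below (the input path, Redis host, and hashset name are illustrative placeholders, not values from the repository):

    hadoop jar mrdp.jar RedisOutput /data/users localhost:6379 user-reputation

Since the class runs under ToolRunner, generic Hadoop options such as -D key=value are consumed by GenericOptionsParser before run() is called, so only the example name and its own arguments reach the dispatch logic.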
@@ -1,97 +1,97 @@
package mrdp.appendixA;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

public class BloomFilterDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 4) {
            System.err.println("Usage: BloomFilterDriver <inputfile> <nummembers> <falseposrate> <bfoutfile>");
            System.exit(1);
        }

        FileSystem fs = FileSystem.get(conf);

        // Parse command line arguments
        Path inputFile = new Path(otherArgs[0]);
        int numMembers = Integer.parseInt(otherArgs[1]);
        float falsePosRate = Float.parseFloat(otherArgs[2]);
        Path bfFile = new Path(otherArgs[3]);

        // Calculate the bit vector size and optimal number of hash
        // functions from the expected membership count and the desired
        // false positive rate
        int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
        int nbHash = getOptimalK(numMembers, vectorSize);

        // Create a new Bloom filter backed by murmur hash
        BloomFilter filter = new BloomFilter(vectorSize, nbHash,
                Hash.MURMUR_HASH);

        System.out.println("Training Bloom filter of size " + vectorSize
                + " with " + nbHash + " hash functions, " + numMembers
                + " approximate number of records, and " + falsePosRate
                + " false positive rate");

        // Open each file under the input path and train the filter on
        // every line
        String line = null;
        int numRecords = 0;
        for (FileStatus status : fs.listStatus(inputFile)) {
            BufferedReader rdr;
            // If the file is gzipped, wrap it in a GZIPInputStream
            if (status.getPath().getName().endsWith(".gz")) {
                rdr = new BufferedReader(new InputStreamReader(
                        new GZIPInputStream(fs.open(status.getPath()))));
            } else {
                rdr = new BufferedReader(new InputStreamReader(
                        fs.open(status.getPath())));
            }

            System.out.println("Reading " + status.getPath());
            while ((line = rdr.readLine()) != null) {
                filter.add(new Key(line.getBytes()));
                ++numRecords;
            }

            rdr.close();
        }

        System.out.println("Trained Bloom filter with " + numRecords
                + " entries.");

        System.out.println("Serializing Bloom filter to HDFS at " + bfFile);
        FSDataOutputStream strm = fs.create(bfFile);
        filter.write(strm);

        strm.flush();
        strm.close();

        System.out.println("Done training Bloom filter.");
    }

    // Optimal bit vector size: m = -n * ln(p) / (ln 2)^2
    public static int getOptimalBloomFilterSize(int numRecords,
            float falsePosRate) {
        return (int) (-numRecords * (float) Math.log(falsePosRate) / Math
                .pow(Math.log(2), 2));
    }

    // Optimal number of hash functions: k = (m / n) * ln 2
    public static int getOptimalK(float numMembers, float vectorSize) {
        return (int) Math.round(vectorSize / numMembers * Math.log(2));
    }
}
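The two helper methods implement the standard Bloom filter sizing formulas: the optimal bit vector size is m = -n * ln(p) / (ln 2)^2, and the optimal number of hash functions is k = (m / n) * ln 2, where n is the expected number of members and p is the target false positive rate. A worked example with illustrative numbers (not taken from the repository): for n = 1,000,000 records at p = 0.05, m = 1,000,000 * 2.9957 / 0.4805, about 6,235,000 bits (roughly 761 KiB), and k = 6.235 * 0.6931, about 4.32, which rounds to 4 hash functions.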
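The serialized filter is meant to be read back later, for example in a mapper's setup method. The following is a minimal sketch of the read side, not part of this commit: the class name BloomFilterReader is hypothetical, and the filter path and test value come from the command line. It relies only on the same Hadoop classes used above plus FSDataInputStream, which implements DataInput.

package mrdp.appendixA;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;

public class BloomFilterReader {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: BloomFilterReader <bffile> <testvalue>");
            System.exit(1);
        }

        FileSystem fs = FileSystem.get(new Configuration());

        // Deserialize the filter exactly as BloomFilterDriver wrote it
        BloomFilter filter = new BloomFilter();
        FSDataInputStream strm = fs.open(new Path(args[0]));
        filter.readFields(strm);
        strm.close();

        // Test membership of the given value
        System.out.println(args[1] + " in filter: "
                + filter.membershipTest(new Key(args[1].getBytes())));
    }
}

Note the asymmetry of the membership test: it can return false positives at roughly the configured rate, but never false negatives for keys added during training.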