diff --git a/bigdata/src/D_H_mapper.py b/bigdata/src/D_H_mapper.py
new file mode 100644
index 0000000..816e298
--- /dev/null
+++ b/bigdata/src/D_H_mapper.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+"""D_H_mapper.py"""
+
+import sys
+
+# input comes from STDIN (standard input)
+for line in sys.stdin:
+    # remove leading and trailing whitespace
+    line = line.strip()
+    # split the line into words
+    words = line.split()
+    # increase counters
+    for word in words:
+        # write the results to STDOUT (standard output);
+        # what we output here will be the input for the
+        # Reduce step, i.e. the input for reducer.py
+        #
+        # tab-delimited; the trivial word count is 1
+        print ('%s\t%s' % (word, 1))
\ No newline at end of file
diff --git a/bigdata/src/D_H_reducer.py b/bigdata/src/D_H_reducer.py
new file mode 100644
index 0000000..d63d388
--- /dev/null
+++ b/bigdata/src/D_H_reducer.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+"""D_H_reducer.py"""
+
+import sys
+
+current_word = None
+current_count = 0
+word = None
+
+# input comes from STDIN
+for line in sys.stdin:
+    # remove leading and trailing whitespace
+    line = line.strip()
+
+    # parse the input we got from mapper.py
+    word, count = line.split('\t', 1)
+
+    # convert count (currently a string) to int
+    try:
+        count = int(count)
+    except ValueError:
+        # count was not a number, so silently
+        # ignore/discard this line
+        continue
+
+    # this IF-switch only works because Hadoop sorts map output
+    # by key (here: word) before it is passed to the reducer
+    if current_word == word:
+        current_count += count
+    else:
+        # 'is not None' (not truthiness): an empty-string key is a
+        # valid word and must still be emitted
+        if current_word is not None:
+            # write result to STDOUT
+            print ('%s\t%s' % (current_word, current_count))
+        current_count = count
+        current_word = word
+
+# emit the last word — but only if we actually consumed any input;
+# with empty stdin both vars are None and nothing must be printed
+if current_word is not None:
+    print ('%s\t%s' % (current_word, current_count))