"""MapReduce job that reports the ten most-viewed posts.

Each input line is scanned for the attributes ``Id="..."`` and
``ViewCount="..."``; lines lacking either attribute are ignored.
"""
import heapq
import re
from operator import itemgetter

from mrjob.job import MRJob
from mrjob.step import MRStep

# Number of posts reported by the reducer.
TOP_N = 10

# Compiled once at import time; the leading ``\s`` keeps ``Id`` from matching
# inside longer attribute names that merely end in "Id".
_ID_RE = re.compile(r'\sId="(.*?)"')
_VIEWS_RE = re.compile(r'\sViewCount="(.*?)"')


# main class
class MRmostViews(MRJob):
    """Single-step job: extract (id, view count) per line, keep the top ten."""

    def steps(self):
        """Return the job's only step (mapper + reducer)."""
        return [
            MRStep(
                mapper=self.get_posts_views,
                reducer=self.post_filter,
            )
        ]

    # get posts and views
    def get_posts_views(self, _, line):
        """Emit ``{"id": <str>, "count": <int>}`` for one input line.

        All records are emitted under the key ``None`` so that a single
        reducer call sees every post and can rank them globally.

        Lines without an ``Id`` or ``ViewCount`` attribute, or whose
        ``ViewCount`` is not an integer, produce no output.
        """
        id_match = _ID_RE.search(line)
        views_match = _VIEWS_RE.search(line)
        # This is a generator: a bare return simply emits nothing for the line.
        if id_match is None or views_match is None:
            return

        # group(1) is the text captured between the quotes, e.g. "11" / "1836".
        try:
            post_views = int(views_match.group(1))
        except ValueError:
            # Malformed count: skip the record instead of failing the job,
            # but leave a trace in the job counters.
            self.increment_counter("postviews", "invalid ViewCount", 1)
            return

        yield None, {"id": id_match.group(1), "count": post_views}

    # get posts, rank them and return the top ten
    def post_filter(self, _, line):
        """Yield ``(post id, view count)`` for the ``TOP_N`` most-viewed posts.

        ``line`` is the iterable of dicts produced by the mapper (the name is
        kept for interface compatibility). Results are yielded in descending
        order of view count.
        """
        # nlargest streams the values and keeps only TOP_N in memory; it is
        # documented as equivalent to sorted(..., reverse=True)[:TOP_N].
        top_posts = heapq.nlargest(TOP_N, line, key=itemgetter("count"))
        for post in top_posts:
            # Explicit key/value pair rather than relying on dict ordering.
            yield post["id"], post["count"]


if __name__ == "__main__":
    MRmostViews.run()