Merge pull request #1 from gjbex/development

gjbex · web-flow · commit c6b1917402f5 · 2019-12-10T14:36:34.000+01:00
Update slides & add MPI example
diff --git a/.gitignore b/.gitignore
@@ -108,3 +108,4 @@ venv.bak/
 *.swp
 *.bak
 ~$scientific_python.pptx
+~$python_hpc.pptx
diff --git a/python_hpc.pptx b/python_hpc.pptx
diff --git a/source-code/sentence-counter/.gitignore b/source-code/sentence-counter/.gitignore
@@ -0,0 +1 @@
+*.pbs.*
diff --git a/source-code/sentence-counter/README.md b/source-code/sentence-counter/README.md
@@ -0,0 +1,15 @@
+SentenceCounter
+===============
+
+Naively (from a linguistic point of view) count sentences in natural
+language text.
+
+What is it?
+-----------
+1. `count_sentences.py`: serial implementation of sentence counting script
+1. `count_sentences_par.py`: MPI implementation of sentence counting
+   script
+1. `run_count.sh`: shell script illustrating how to run the parallel
+   version
+1. `data.txt`, `fragment.txt`, `long-data.txt`: various data sets to
+   test the scripts with
diff --git a/source-code/sentence-counter/count.pbs b/source-code/sentence-counter/count.pbs
@@ -0,0 +1,9 @@
+#!/bin/bash -l
+#PBS -l nodes=1:ppn=4
+#PBS -l walltime=00:10:00
+
+# Batch job that runs the MPI sentence counter on long-data.txt.
+
+module load Python/2.7.6-foss-2014a
+
+# run from the directory the job was submitted from; stop if it cannot
+# be entered, otherwise mpirun would look for the script and the data in
+# the wrong place
+cd "$PBS_O_WORKDIR" || exit 1
+
+mpirun ./count_sentences_par.py long-data.txt
diff --git a/source-code/sentence-counter/count_sentences.py b/source-code/sentence-counter/count_sentences.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+"""
+This script is for illustration purposes only, it ignores quite a number
+of natural language features, so it should be considered an example only
+in a technical sense.
+"""
+
+from argparse import ArgumentParser
+import os.path
+import sys
+
+terminators = set(['.', '?', '!'])
+
+
+def extract_prefix(file_name, start_pos, end_pos):
+    global terminators
+    prefix_str = ''
+    index = 0
+    with open(file_name, 'r') as file:
+        file.seek(start_pos)
+        if start_pos < end_pos:
+            c = file.read(1)
+            if not c.isupper():
+                file.seek(start_pos)
+                while index + start_pos < end_pos:
+                    c = file.read(1)
+                    prefix_str += c
+                    index += 1
+                    if c in terminators:
+                        break
+        return (prefix_str, start_pos + index)
+
+
+def extract_suffix(file_name, start_pos, end_pos):
+    global terminators
+    suffix_str = ''
+    with open(file_name, 'r') as file:
+        file.seek(end_pos)
+        if end_pos >= start_pos:
+            c = file.read(1)
+            if c not in terminators:
+                file.seek(end_pos)
+                while end_pos >= start_pos:
+                    c = file.read(1)
+                    if c in terminators:
+                        break
+                    else:
+                        suffix_str = c + suffix_str
+                        current_pos = file.tell()
+                        file.seek(current_pos - 2)
+                        end_pos -= 1
+        return (suffix_str, end_pos)
+
+
+def count_sentences(file_name, start_pos, end_pos):
+    """Parse a given string, returning the number of sentences, as well
+       as the prefix to the first sentence, as well as the suffix to the
+       last."""
+    global terminators
+    with open(file_name, 'r') as file:
+        file.seek(start_pos)
+        count = 0
+        index = start_pos
+        while index <= end_pos:
+            c = file.read(1)
+            if c in terminators:
+                count += 1
+            index += 1
+        return count
+
+
+def main():
+    arg_parser = ArgumentParser(description='count sentences in a text file')
+    arg_parser.add_argument('-v', dest='is_verbose', action='store_true',
+                            help='verbose output for debugging')
+    arg_parser.add_argument('file', metavar='FILE', help='file to parse')
+    options = arg_parser.parse_args()
+    start_pos = 0
+    end_pos = os.path.getsize(options.file)
+    if options.is_verbose:
+        msg = "reading '{0}' from {1} to {2}\n"
+        sys.stderr.write(msg.format(options.file, start_pos, end_pos))
+    prefix, new_start_pos = extract_prefix(options.file, start_pos, end_pos)
+    if options.is_verbose:
+        print("prefix: '{0}', start: {1}".format(prefix, new_start_pos))
+    suffix, new_end_pos = extract_suffix(options.file, start_pos, end_pos)
+    if options.is_verbose:
+        print("suffix: '{0}', end: {1}".format(suffix, new_end_pos))
+    count = count_sentences(options.file, new_start_pos, new_end_pos)
+    print("sentences: {0}".format(count))
+    return 0
+
+if __name__ == '__main__':
+    status = main()
+    sys.exit(status)
diff --git a/source-code/sentence-counter/count_sentences_par.py b/source-code/sentence-counter/count_sentences_par.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+"""
+This script is for illustration purposes only, it ignores quite a number
+of natural language features, so it should be considered an example only
+in a technical sense.
+"""
+
+from argparse import ArgumentParser
+import os.path
+import sys
+from mpi4py import MPI
+
+terminators = set(['.', '?', '!'])
+
+
+def extract_prefix(file_name, start_pos, end_pos):
+    global terminators
+    prefix_str = ''
+    index = 0
+    with open(file_name, 'r') as file:
+        file.seek(start_pos)
+        if start_pos < end_pos:
+            c = file.read(1)
+            if not c.isupper():
+                file.seek(start_pos)
+                while index + start_pos < end_pos:
+                    c = file.read(1)
+                    prefix_str += c
+                    index += 1
+                    if c in terminators:
+                        break
+        return (prefix_str, start_pos + index)
+
+
+def extract_suffix(file_name, start_pos, end_pos):
+    global terminators
+    suffix_str = ''
+    with open(file_name, 'r') as file:
+        file.seek(end_pos)
+        if end_pos >= start_pos:
+            c = file.read(1)
+            if c not in terminators:
+                file.seek(end_pos)
+                while end_pos >= start_pos:
+                    c = file.read(1)
+                    if c in terminators:
+                        break
+                    else:
+                        suffix_str = c + suffix_str
+                        current_pos = file.tell()
+                        file.seek(current_pos - 2)
+                        end_pos -= 1
+        return (suffix_str, end_pos)
+
+
+def count_sentences(file_name, start_pos, end_pos):
+    """Parse a given string, returning the number of sentences, as well
+       as the prefix to the first sentence, as well as the suffix to the
+       last."""
+    global terminators
+    with open(file_name, 'r') as file:
+        file.seek(start_pos)
+        count = 0
+        index = start_pos
+        while index <= end_pos:
+            c = file.read(1)
+            if c in terminators:
+                count += 1
+            index += 1
+        return count
+
+
+def main():
+    comm = MPI.COMM_WORLD
+    size = comm.size
+    rank = comm.rank
+    arg_parser = ArgumentParser(description='count sentences in a text file')
+    arg_parser.add_argument('-v', dest='is_verbose', action='store_true',
+                            help='verbose output for debugging')
+    arg_parser.add_argument('file', metavar='FILE', help='file to parse')
+    options = arg_parser.parse_args()
+    file_size = os.path.getsize(options.file)
+    chunck_size = file_size//size
+    start_pos = chunck_size*rank
+    if rank + 1 < size:
+        end_pos = start_pos + chunck_size - 1
+    else:
+        end_pos = file_size - 1
+    if options.is_verbose:
+        msg = "rank {0} reading '{1}' from {2} to {3}\n"
+        sys.stderr.write(msg.format(rank, options.file, start_pos, end_pos))
+
+    prefix, new_start_pos = extract_prefix(options.file, start_pos, end_pos)
+# send new_start_pos - 1 to previous, to use as new_end_pos
+    if rank > 0:
+        comm.isend(new_start_pos - 1, dest=rank - 1)
+# receive new_new_end_pos from next
+    if rank < size - 1:
+        end_pos = comm.recv(source=rank + 1)
+    if options.is_verbose:
+        msg = "rank {0} reading '{1}' from {2} to {3}\n"
+        sys.stderr.write(msg.format(rank, options.file, new_start_pos,
+                                    end_pos))
+    count = count_sentences(options.file, new_start_pos, end_pos)
+    if options.is_verbose:
+        sys.stderr.write('rank {0} counted {1}\n'.format(rank, count))
+    total = comm.reduce(count, op=MPI.SUM, root=0)
+    if rank == 0:
+        print('sentences: {0}'.format(total))
+    return 0
+
+if __name__ == '__main__':
+    status = main()
+    sys.exit(status)
diff --git a/source-code/sentence-counter/data.txt b/source-code/sentence-counter/data.txt
@@ -0,0 +1,8 @@
+This is a short text with a number of sentneces.  Each sentence should
+be counted, where sentence delimitors would be a dot, a question mark and
+and exclamation mark.  Sentences can span multiple lines as well.  Although
+this seems a simple task, it is nevertheless interesting because it can
+be parallelized using MPI.
+The sequential version should reflect the approach that will be taken
+by the parallel version as well.
+
diff --git a/source-code/sentence-counter/fragment.txt b/source-code/sentence-counter/fragment.txt
@@ -0,0 +1,3 @@
+ start of an imcomplete sentence.  This is added to test the prefix
+feature of the counter.  There is a suffix too, again to test whether
+that works. Now let's see  
diff --git a/source-code/sentence-counter/long-data.txt b/source-code/sentence-counter/long-data.txt
@@ -0,0 +1,10 @@
+This is a text with quite a number of sentences.  It is intended as input
+data for the parallel version of this program.  We are interested to see
+whether it works as expected.  The program should count the number of
+sentences in this text.  Although this is trivial, doing so in parallel
+presents some challenges.  For one thing, one has to split the text
+between processes.  This should be done without scanning the text
+first.  So each process gets a piece of text.  The latter can start with
+a part of a sentence that should be processed together with the previous
+piece.  Hence the start value of a process is the end value of the
+previous, and has to be communicated.
diff --git a/source-code/sentence-counter/run_count.sh b/source-code/sentence-counter/run_count.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Run the MPI sentence counter with 4 processes; all arguments given to
+# this script are handed on to count_sentences_par.py.
+
+source /apps/leuven/etc/bash.bashrc
+module load openmpi/1.4.3_intel python/2.7.1
+
+# "$@" is quoted so that arguments containing whitespace stay intact
+mpirun -np 4 ./count_sentences_par.py "$@"
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+ start of an imcomplete sentence. This is added to test the prefix`
	`2`	`+feature of the counter. There is a suffix too, again to test whether`
	`3`	`+that works. Now let's see`