Skip to content

Commit c6b1917

Browse files
authored
Merge pull request #1 from gjbex/development
Update slides & add MPI example
2 parents 9aa47cf + 20724c9 commit c6b1917

11 files changed

+263
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,4 @@ venv.bak/
108108
*.swp
109109
*.bak
110110
~$scientific_python.pptx
111+
~$python_hpc.pptx

python_hpc.pptx

-24.9 KB
Binary file not shown.
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.pbs.*
+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
SentenceCounter
2+
===============
3+
4+
Naively (from a linguistic point of view) count sentences in natural
5+
language text.
6+
7+
What is it?
8+
-----------
9+
1. `count_sentences.py`: serial implementation of sentence counting script
10+
1. `count_sentences_par.py`: MPI implementation of sentence counting
11+
script
12+
1. `run_count.sh`: shell script illustrating how to run the parallel
13+
version
14+
1. `data.txt`, `fragement.txt`, `long-data.txt`: various data sets to
15+
test the scripts with
+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash -l
#PBS -l nodes=1:ppn=4
#PBS -l walltime=00:10:00

# Load the toolchain that provides the Python interpreter (and, via the
# foss toolchain, an MPI runtime) used by the parallel script.
module load Python/2.7.6-foss-2014a

# PBS starts jobs in the home directory; change to where the job was
# submitted from, so relative paths below resolve.
cd $PBS_O_WORKDIR

# No -np given: presumably mpirun picks the process count up from the
# PBS allocation (nodes*ppn) -- verify for the local MPI stack.
mpirun ./count_sentences_par.py long-data.txt
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#!/usr/bin/env python
"""
This script is for illustration purposes only, it ignores quite a number
of natural language features, so it should be considered an example only
in a technical sense.
"""

from argparse import ArgumentParser
import os.path
import sys

# characters that (naively) mark the end of a sentence
terminators = set(['.', '?', '!'])
13+
14+
15+
def extract_prefix(file_name, start_pos, end_pos):
16+
global terminators
17+
prefix_str = ''
18+
index = 0
19+
with open(file_name, 'r') as file:
20+
file.seek(start_pos)
21+
if start_pos < end_pos:
22+
c = file.read(1)
23+
if not c.isupper():
24+
file.seek(start_pos)
25+
while index + start_pos < end_pos:
26+
c = file.read(1)
27+
prefix_str += c
28+
index += 1
29+
if c in terminators:
30+
break
31+
return (prefix_str, start_pos + index)
32+
33+
34+
def extract_suffix(file_name, start_pos, end_pos):
    """Scan backwards from end_pos for the last sentence terminator.

    Returns a (suffix, new_end) tuple: suffix is the trailing partial
    sentence after the last terminator, and new_end is the position of
    that terminator (unchanged if the character at end_pos already
    terminates a sentence).

    NOTE(review): walks the file in reverse by seeking back two
    positions after each one-character read; this assumes one file
    position per character (single-byte text) -- TODO confirm for
    non-ASCII input.  A read at EOF yields '', which is harmlessly
    prepended to the suffix, so end_pos may equal the file size.
    """
    global terminators
    suffix_str = ''
    with open(file_name, 'r') as file:
        file.seek(end_pos)
        if end_pos >= start_pos:
            c = file.read(1)
            # if the character at end_pos is itself a terminator there
            # is no dangling suffix to strip
            if c not in terminators:
                file.seek(end_pos)
                # walk backwards one character at a time until a
                # terminator is found or start_pos is passed
                while end_pos >= start_pos:
                    c = file.read(1)
                    if c in terminators:
                        break
                    else:
                        suffix_str = c + suffix_str
                        current_pos = file.tell()
                        # step back over the character just read, plus
                        # one more, so the next read is one earlier
                        file.seek(current_pos - 2)
                        end_pos -= 1
    return (suffix_str, end_pos)
53+
54+
55+
def count_sentences(file_name, start_pos, end_pos):
56+
"""Parse a given string, returning the number of sentences, as well
57+
as the prefix to the first sentence, as well as the suffix to the
58+
last."""
59+
global terminators
60+
with open(file_name, 'r') as file:
61+
file.seek(start_pos)
62+
count = 0
63+
index = start_pos
64+
while index <= end_pos:
65+
c = file.read(1)
66+
if c in terminators:
67+
count += 1
68+
index += 1
69+
return count
70+
71+
72+
def main():
    """Parse command-line options and report the sentence count of FILE.

    Trims the leading and trailing partial sentences (mirroring the
    approach of the parallel version) before counting.  Returns 0 as
    the exit status.
    """
    arg_parser = ArgumentParser(description='count sentences in a text file')
    arg_parser.add_argument('-v', dest='is_verbose', action='store_true',
                            help='verbose output for debugging')
    arg_parser.add_argument('file', metavar='FILE', help='file to parse')
    options = arg_parser.parse_args()
    start_pos, end_pos = 0, os.path.getsize(options.file)
    if options.is_verbose:
        sys.stderr.write("reading '{0}' from {1} to {2}\n"
                         .format(options.file, start_pos, end_pos))
    # skip the partial sentence (if any) at the start of the range
    prefix, new_start_pos = extract_prefix(options.file, start_pos, end_pos)
    if options.is_verbose:
        print("prefix: '{0}', start: {1}".format(prefix, new_start_pos))
    # and the partial sentence (if any) at the end of the range
    suffix, new_end_pos = extract_suffix(options.file, start_pos, end_pos)
    if options.is_verbose:
        print("suffix: '{0}', end: {1}".format(suffix, new_end_pos))
    count = count_sentences(options.file, new_start_pos, new_end_pos)
    print("sentences: {0}".format(count))
    return 0
92+
93+
if __name__ == '__main__':
    # run as a script: exit with main()'s return value as status code
    status = main()
    sys.exit(status)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#!/usr/bin/env python
"""
This script is for illustration purposes only, it ignores quite a number
of natural language features, so it should be considered an example only
in a technical sense.
"""

from argparse import ArgumentParser
import os.path
import sys
from mpi4py import MPI

# characters that (naively) mark the end of a sentence
terminators = set(['.', '?', '!'])
14+
15+
16+
def extract_prefix(file_name, start_pos, end_pos):
17+
global terminators
18+
prefix_str = ''
19+
index = 0
20+
with open(file_name, 'r') as file:
21+
file.seek(start_pos)
22+
if start_pos < end_pos:
23+
c = file.read(1)
24+
if not c.isupper():
25+
file.seek(start_pos)
26+
while index + start_pos < end_pos:
27+
c = file.read(1)
28+
prefix_str += c
29+
index += 1
30+
if c in terminators:
31+
break
32+
return (prefix_str, start_pos + index)
33+
34+
35+
def extract_suffix(file_name, start_pos, end_pos):
    """Scan backwards from end_pos for the last sentence terminator.

    Returns a (suffix, new_end) tuple: suffix is the trailing partial
    sentence after the last terminator, and new_end is the position of
    that terminator (unchanged if the character at end_pos already
    terminates a sentence).

    NOTE(review): walks the file in reverse by seeking back two
    positions after each one-character read; this assumes one file
    position per character (single-byte text) -- TODO confirm for
    non-ASCII input.  A read at EOF yields '', which is harmlessly
    prepended to the suffix, so end_pos may equal the file size.
    """
    global terminators
    suffix_str = ''
    with open(file_name, 'r') as file:
        file.seek(end_pos)
        if end_pos >= start_pos:
            c = file.read(1)
            # if the character at end_pos is itself a terminator there
            # is no dangling suffix to strip
            if c not in terminators:
                file.seek(end_pos)
                # walk backwards one character at a time until a
                # terminator is found or start_pos is passed
                while end_pos >= start_pos:
                    c = file.read(1)
                    if c in terminators:
                        break
                    else:
                        suffix_str = c + suffix_str
                        current_pos = file.tell()
                        # step back over the character just read, plus
                        # one more, so the next read is one earlier
                        file.seek(current_pos - 2)
                        end_pos -= 1
    return (suffix_str, end_pos)
54+
55+
56+
def count_sentences(file_name, start_pos, end_pos):
57+
"""Parse a given string, returning the number of sentences, as well
58+
as the prefix to the first sentence, as well as the suffix to the
59+
last."""
60+
global terminators
61+
with open(file_name, 'r') as file:
62+
file.seek(start_pos)
63+
count = 0
64+
index = start_pos
65+
while index <= end_pos:
66+
c = file.read(1)
67+
if c in terminators:
68+
count += 1
69+
index += 1
70+
return count
71+
72+
73+
def main():
    """Count sentences in FILE in parallel over MPI.

    Each rank takes an equal-size chunk of the file (the last rank
    absorbs the remainder), strips its leading partial sentence, hands
    the boundary position to the previous rank, counts terminators in
    its adjusted range, and the per-rank counts are summed on rank 0.
    Returns 0 as the exit status on every rank.
    """
    comm = MPI.COMM_WORLD
    size = comm.size
    rank = comm.rank
    arg_parser = ArgumentParser(description='count sentences in a text file')
    arg_parser.add_argument('-v', dest='is_verbose', action='store_true',
                            help='verbose output for debugging')
    arg_parser.add_argument('file', metavar='FILE', help='file to parse')
    options = arg_parser.parse_args()
    # static block decomposition of the file over the ranks
    file_size = os.path.getsize(options.file)
    chunck_size = file_size//size
    start_pos = chunck_size*rank
    if rank + 1 < size:
        end_pos = start_pos + chunck_size - 1
    else:
        # last rank takes everything up to the end of the file
        end_pos = file_size - 1
    if options.is_verbose:
        msg = "rank {0} reading '{1}' from {2} to {3}\n"
        sys.stderr.write(msg.format(rank, options.file, start_pos, end_pos))

    prefix, new_start_pos = extract_prefix(options.file, start_pos, end_pos)
    # send new_start_pos - 1 to previous, to use as new_end_pos
    if rank > 0:
        # NOTE(review): the isend request is never waited on or freed --
        # TODO confirm the send completes before the rank exits
        comm.isend(new_start_pos - 1, dest=rank - 1)
    # receive new_new_end_pos from next
    if rank < size - 1:
        end_pos = comm.recv(source=rank + 1)
    if options.is_verbose:
        msg = "rank {0} reading '{1}' from {2} to {3}\n"
        sys.stderr.write(msg.format(rank, options.file, new_start_pos,
                                    end_pos))
    count = count_sentences(options.file, new_start_pos, end_pos)
    if options.is_verbose:
        sys.stderr.write('rank {0} counted {1}\n'.format(rank, count))
    # sum the per-rank counts; only rank 0 receives the total
    total = comm.reduce(count, op=MPI.SUM, root=0)
    if rank == 0:
        print('sentences: {0}'.format(total))
    return 0
111+
112+
if __name__ == '__main__':
    # run as a script (one process per MPI rank): exit with main()'s
    # return value as status code
    status = main()
    sys.exit(status)

source-code/sentence-counter/data.txt

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
This is a short text with a number of sentences. Each sentence should
2+
be counted, where sentence delimiters would be a dot, a question mark and
3+
an exclamation mark. Sentences can span multiple lines as well. Although
4+
this seems a simple task, it is nevertheless interesting because it can
5+
be parallelized using MPI.
6+
The sequential version should reflect the approach that will be taken
7+
by the parallel version as well.
8+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
start of an incomplete sentence. This is added to test the prefix
2+
feature of the counter. There is a suffix too, again to test whether
3+
that works. Now let's see
+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
This is a text with quite a number of sentences. It is intended as input
2+
data for the parallel version of this program. We are interested to see
3+
whether it works as expected. The program should count the number of
4+
sentences in this text. Although this is trivial, doing so in parallel
5+
presents some challenges. For one thing, one has to split the text
6+
between processes. This should be done without scanning the text
7+
first. So each process gets a piece of text. The latter can start with
8+
a part of a sentence that should be processed together with the previous
9+
piece. Hence the start value of a process is the end value of the
10+
previous, and has to be communicated.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
# Run the parallel sentence counter on 4 MPI processes, forwarding all
# script arguments (typically the input file name) to it.

source /apps/leuven/etc/bash.bashrc
module load openmpi/1.4.3_intel python/2.7.1

# "$@" (quoted) preserves each argument as a single word; the original
# unquoted $@ would word-split file names containing spaces
mpirun -np 4 ./count_sentences_par.py "$@"
7+

0 commit comments

Comments
 (0)