-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis_of_czech_language.py
135 lines (110 loc) · 3.88 KB
/
analysis_of_czech_language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
This program analyzes how often individual letters occur in Czech-language text.
Author: Dominik Rumian
03-21-2019
"""
import operator
import matplotlib.pyplot as plt
import time
def readFile(nameOfFile, letters, encoding=None):
    """
    Reads a text file and accumulates per-letter counts into ``letters``.

    Every alphabetic character (as decided by ``str.isalpha``) is lower-cased
    and counted; all other characters are skipped. Counts are added on top of
    whatever ``letters`` already contains, so several files can be tallied
    into the same dictionary. The number of letters counted in this file is
    printed to standard output.

    :param nameOfFile: string, name of file to be analyzed
    :param letters: dictionary mapping lower-cased letter to its count,
        updated in place
    :param encoding: optional text encoding passed to ``open``; the default
        ``None`` keeps the platform default encoding
    :return: void
    """
    letterCount = 0
    # The context manager closes the file even if reading or decoding fails.
    with open(nameOfFile, "r", encoding=encoding) as file:
        for line in file:
            for char in line:
                if char.isalpha():
                    lowered = char.lower()
                    letters[lowered] = letters.get(lowered, 0) + 1
                    letterCount += 1
    print(letterCount)
def letterAnalysis(dictOfLetters):
    """
    Converts the letter-count dictionary to a list of tuples sorted by count.

    The sort is descending by count and stable, so letters with equal counts
    keep the order in which they appear in the dictionary.

    :param dictOfLetters: dictionary mapping letter to its count
    :return: sorted list of tuples in format (letter, count)
    """
    return sorted(dictOfLetters.items(), key=operator.itemgetter(1), reverse=True)
def createPlotDataAllLetters(tupleList):
    """
    Splits a list of (letter, count) tuples into parallel lists for plotting.

    :param tupleList: list of tuples in format (letter, count)
    :return: tuple (letters, values, positions) where letters holds the first
        item of every tuple, values the second item, and positions the
        1-based index of every entry
    """
    letters = [tup[0] for tup in tupleList]
    values = [tup[1] for tup in tupleList]
    positions = list(range(1, len(letters) + 1))
    return letters, values, positions
# Czech vowels, including the accented forms; shared by the vowel and
# consonant selectors so the two can never disagree.
_CZECH_VOWELS = frozenset(
    ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'ě', 'í', 'ó', 'ú', 'ů', 'y', 'ý']
)


def _selectPlotData(tupleList, wantVowels):
    """Returns plot data for only the vowel (or only the non-vowel) entries."""
    labels = []
    counts = []
    for tup in tupleList:
        if (tup[0] in _CZECH_VOWELS) == wantVowels:
            labels.append(tup[0])
            counts.append(tup[1])
    # NOTE(review): positions are numbered consecutively among the selected
    # letters (1, 2, 3, ...) so the bars sit next to each other; the source
    # this was reconstructed from had lost its indentation - confirm that
    # this matches the intended chart layout.
    positions = list(range(1, len(labels) + 1))
    return labels, counts, positions


def createPlotDataVowels(tupleList):
    """
    Builds plot data for the vowels contained in a (letter, count) list.

    :param tupleList: list of tuples in format (letter, count)
    :return: tuple (letters, values, positions) restricted to entries whose
        letter is a Czech vowel, in their original order; positions are
        1-based
    """
    return _selectPlotData(tupleList, True)


def createPlotDataConsonants(tupleList):
    """
    Builds plot data for the non-vowel letters of a (letter, count) list.

    Every entry whose letter is not in the Czech vowel set is treated as a
    consonant.

    :param tupleList: list of tuples in format (letter, count)
    :return: tuple (letters, values, positions) restricted to entries whose
        letter is not a Czech vowel, in their original order; positions are
        1-based
    """
    return _selectPlotData(tupleList, False)
def plotToPDF(xAxisLabels, yAxisValues, xAxisPositions, xDimension, yDimension, title):
    """
    Draws a bar chart and saves it as "<title>.pdf".

    :param xAxisLabels: list of tick labels, one per bar
    :param yAxisValues: list of bar heights
    :param xAxisPositions: list of x positions of the bars
    :param xDimension: figure width, passed to matplotlib's ``figsize``
    :param yDimension: figure height, passed to matplotlib's ``figsize``
    :param title: plot title; also used as the output file name without the
        ".pdf" extension
    :return: void
    """
    # figsize is expressed in inches (width, height).
    figure = plt.figure(figsize=(xDimension, yDimension))
    try:
        plt.bar(xAxisPositions, yAxisValues, tick_label=xAxisLabels, width=0.8,
                color=['red', 'green', 'blue'])
        # Axis captions are in Czech: "Letter" / "Number of occurrences".
        plt.xlabel('Písmeno')
        plt.ylabel('Počet výskytů')
        plt.title(title)
        plt.savefig(title + ".pdf", format='pdf')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not pile up open figures.
        plt.close(figure)
def main():
    """
    Counts the letters of the configured books and plots three bar charts.

    Reads every file in ``books`` into one shared letter-count dictionary,
    sorts the letters by descending count, and writes one PDF each for all
    letters, vowels only and consonants only. Finally prints the elapsed
    running time in seconds.

    :return: void
    """
    # perf_counter is a monotonic clock meant for measuring intervals.
    start = time.perf_counter()
    letterCounts = {}
    # List of books to be analyzed
    books = ["bible.txt"]
    for book in books:
        readFile(book, letterCounts)
    tupleList = letterAnalysis(letterCounts)
    # One (data builder, output title) pair per chart to produce.
    plots = (
        (createPlotDataAllLetters, "All_letters"),
        (createPlotDataVowels, "Vowels_Only"),
        (createPlotDataConsonants, "Consonants_Only"),
    )
    for buildPlotData, title in plots:
        labels, values, positions = buildPlotData(tupleList)
        plotToPDF(labels, values, positions, 14, 6, title)
    end = time.perf_counter()
    print("Time of running: ", end - start)
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()