-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
72 lines (52 loc) · 2.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# A PDF-to-text scanner, planned for use as part of a resume recommendation system.
# references:
# Links to all possible libs about pdf to text :
# https://stackabuse.com/working-with-pdfs-in-python-reading-and-splitting-pages/
# links to working system, using pypdf2, textract & nltk libs:
# https://medium.com/better-programming/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
# Links to other related things:
# https://automatetheboringstuff.com/chapter13/
# https://stackoverflow.com/questions/17098675/searching-text-in-a-pdf-using-python
# import libs
import PyPDF2 # using this to convert text based pdf into text
import textract # using this to convert scanned pdf files into text
# import nltk # using this to clean[remove punctuation] & convert phrases into keywords
# If NLTK reports a missing resource, follow the instructions in the error message; for me that meant running nltk.download('punkt') and nltk.download('stopwords').
# The word_tokenize() function will break our text phrases into individual words.
from nltk.tokenize import word_tokenize
# We initialize the stopwords variable, which is a list of words like "The," "I," "and," etc.
# that don't hold much value as keywords.
from nltk.corpus import stopwords
# PART 1: Read the PDF file and collect its text.
# To process many files, wrap this section in a loop over the file paths.
filename = r'C:\Users\bakta\Desktop\testzone\testpdf.pdf'  # pathlib (Python 3.4+) is a cleaner way to build paths

# Open in binary mode ('rb') and let the context manager close the handle,
# even if parsing raises part-way through.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # parser over the open file
    num_pages = pdfReader.numPages  # number of pages to extract
    # Extract every page in order and join once, rather than growing a
    # string with += on each iteration.
    text = "".join(
        pdfReader.getPage(page_index).extractText()
        for page_index in range(num_pages)
    )

# PyPDF2 only extracts embedded text, so a scanned (image-only) PDF yields
# nothing useful. Treat empty or whitespace-only output as "no text" and fall
# back to OCR through textract.
if not text.strip():
    # Arguments: file path, extraction method, OCR language.
    ocr_bytes = textract.process(filename, method='tesseract', language='eng')
    # textract.process returns bytes; decode so `text` is always a str for
    # the tokenizer used later in the script.
    text = ocr_bytes.decode('utf-8')
# print(text)
# At this point the text variable holds all the extracted text, along with a lot of extra spaces and junk.
# Next we clean the text and reduce it to a list of keywords.
# ------------------------------- SO FAR THE LOOP WORKS, TESTED
# PART 2: Convert the extracted text into keywords.
tokens = word_tokenize(text)  # split the text into individual tokens

# Sets give constant-time membership tests; as lists, both collections were
# rescanned from the start for every token.
punctuations = {'(', ')', ';', ':', '[', ']', ','}  # punctuation tokens to drop
stop_words = set(stopwords.words('english'))  # common words with little keyword value

# Keep only tokens that are neither stop words nor listed punctuation.
# NLTK's English stop-word list is lowercase, so the token is lowercased for
# that comparison; otherwise capitalised forms such as "The" or "I" slip
# through. The kept keyword retains its original casing.
keywords = [
    word
    for word in tokens
    if word.lower() not in stop_words and word not in punctuations
]
print(keywords)
# Depending on the PDF's formatting, some extracted text may be distorted, so the result is not 100% accurate.
# The PDF needs to be formatted in a way that lets its text be detected accurately.