# Document Correlation.py
# Compare requirements from two different Excel documents and export the results as a single Excel document for analysis.
# Import the required libraries:
# 'pandas' for data manipulation.
# 'nltk' is the Natural Language Toolkit, a library used for natural language processing tasks.
# 'sklearn' is the scikit-learn library, which provides various tools for machine learning and vectorization.
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Download the necessary resources from 'nltk', the stopwords list and the punkt tokenizer, which are essential for text preprocessing.
nltk.download('stopwords')
nltk.download('punkt')
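# Note (assumption): recent NLTK releases may also require the 'punkt_tab' resource for word_tokenize.
# If a LookupError mentions it, uncomment the line below.
# nltk.download('punkt_tab')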
# Load data from two Excel files into separate dataframes ('df1' and 'df2').
df1 = pd.read_excel("path to file")
df2 = pd.read_excel("path to file")
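# Note: reading .xlsx files with pandas requires the 'openpyxl' package to be installed.
# If the requirements are not on the first worksheet, pass the sheet name explicitly (illustrative sheet name shown):
# df1 = pd.read_excel("path to file", sheet_name="Requirements")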
# Define a function to preprocess the text data.
# This function removes punctuation, converts the text to lowercase, tokenizes it into words, removes stop words, and joins the words back into a single string.
def preprocess_text(text):
    if isinstance(text, str):
        # Remove punctuation.
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Convert to lowercase.
        text = text.lower()
        # Tokenize text into words.
        tokens = word_tokenize(text)
        # Remove stop words.
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        # Join the words back into a single string.
        preprocessed_text = ' '.join(tokens)
        return preprocessed_text
    elif isinstance(text, (int, float)):
        # Convert numerical values to a string.
        return str(text)
    else:
        # If the input is not a string or a numerical value, return an empty string.
        return ''
# The 'preprocess_text' function is applied to specific columns of both 'df1' and 'df2' dataframes using the 'apply' method.
# This step preprocesses the text data in these columns by applying the 'preprocess_text' function to each value.
df1['column1'] = df1['column1'].apply(preprocess_text)
df1['column2'] = df1['column2'].apply(preprocess_text)
df1['column3'] = df1['column3'].apply(preprocess_text)
df1['column4'] = df1['column4'].apply(preprocess_text)
df1['column5'] = df1['column5'].apply(preprocess_text)
# The same preprocessing is now applied to 'df2'.
df2['column1'] = df2['column1'].apply(preprocess_text)
df2['column2'] = df2['column2'].apply(preprocess_text)
df2['column3'] = df2['column3'].apply(preprocess_text)
df2['column4'] = df2['column4'].apply(preprocess_text)
df2['column5'] = df2['column5'].apply(preprocess_text)
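# Equivalent, more compact alternative (sketch; adjust the column names to match your spreadsheets):
# columns_to_compare = ['column1', 'column2', 'column3', 'column4', 'column5']
# for col in columns_to_compare:
#     df1[col] = df1[col].apply(preprocess_text)
#     df2[col] = df2[col].apply(preprocess_text)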
# Initialize a TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer from scikit-learn's 'TfidfVectorizer' class.
# TF-IDF is a numerical statistic that reflects the importance of a word in a document corpus.
tfidf_vectorizer = TfidfVectorizer()
# The vectorizer is fitted on the preprocessed text data from 'df1' using the 'fit_transform' method.
# This step learns the vocabulary from 'df1' and converts the preprocessed text into a matrix representation ('tfidf_matrix1').
tfidf_matrix1 = tfidf_vectorizer.fit_transform(df1['column1'] + ' ' + df1['column2'] + ' ' + df1['column3'] + ' ' + df1['column4'] + ' ' + df1['column5'])
# The preprocessed text data from 'df2' is transformed using the learned vocabulary from 'df1' using the 'transform' method, resulting in 'tfidf_matrix2'.
tfidf_matrix2 = tfidf_vectorizer.transform(df2['column1'] + ' ' + df2['column2'] + ' ' + df2['column3'] + ' ' + df2['column4'] + ' ' + df2['column5'])
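# Note: words that appear only in 'df2' are not part of the vocabulary learned from 'df1' and are ignored by 'transform'.
# If that matters for your data, consider fitting the vectorizer on the combined text of both files instead.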
# Use the 'cosine_similarity' function from scikit-learn's 'metrics.pairwise' module to calculate the
# cosine similarity between each pair of requirements from 'tfidf_matrix1' and 'tfidf_matrix2'.
# Cosine similarity measures the similarity between two vectors.
similarity_matrix = cosine_similarity(tfidf_matrix1, tfidf_matrix2)
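# 'similarity_matrix' has shape (len(df1), len(df2)); entry [i, j] is the similarity between requirement i of 'df1' and requirement j of 'df2'.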
# A similarity threshold value is set to determine whether two requirements are correlated or not.
# This assigns the value 0.35 to 'similarity_threshold'. Adjust this value until it produces the desired results.
similarity_threshold = 0.35
correlated_requirements = []
# Iterate through the similarity matrix to find pairs of correlated requirements.
for i in range(similarity_matrix.shape[0]):
    # Find the index of the maximum similarity value in each row.
    max_similarity_idx = similarity_matrix[i].argmax()
    max_similarity = similarity_matrix[i, max_similarity_idx]
    # If the maximum similarity value in a row is above the threshold, the indices of the correlated requirements are added to the list 'correlated_requirements'.
    if max_similarity >= similarity_threshold:
        correlated_requirements.append((i, max_similarity_idx))
uncorrelated_rows = []
# Iterate through the rows of df2 and check if the index of the row is present in 'correlated_requirements'.
# If it's not, add the index to the 'uncorrelated_rows' list.
for row_idx in range(len(df2)):
    if row_idx not in [req2_idx for _, req2_idx in correlated_requirements]:
        uncorrelated_rows.append(row_idx)
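# For larger spreadsheets, a set lookup avoids rebuilding the list on every iteration (equivalent sketch):
# correlated_df2_indices = {req2_idx for _, req2_idx in correlated_requirements}
# uncorrelated_rows = [row_idx for row_idx in range(len(df2)) if row_idx not in correlated_df2_indices]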
# Create an empty list to store the correlated requirement details.
report = []
# Iterate through 'correlated_requirements' and retrieve the details of the correlated requirements from 'df1' and 'df2'.
# Append these details as tuples ('req1_details', 'req2_details') to the report list.
for req1_idx, req2_idx in correlated_requirements:
    req1_details = df1.iloc[req1_idx]
    req2_details = df2.iloc[req2_idx]
    report.append((req1_details, req2_details))
# Iterate through the 'report' list, extract the columns used for calculation from 'req1_details' and 'req2_details', and assign them to individual variables.
# This allows you to work with the data individually if needed.
for req1_details, req2_details in report:
    req1_column1 = req1_details['column1']
    req2_column1 = req2_details['column1']
    req1_column2 = req1_details['column2']
    req2_column2 = req2_details['column2']
    req1_column3 = req1_details['column3']
    req2_column3 = req2_details['column3']
    req1_impactifnotfunded = req1_details['column4']
    req2_impactifnotfunded = req2_details['column4']
    req1_costcenter = req1_details['column5']
    req2_costcenter = req2_details['column5']
# Extract the relevant columns required for the output report by combining the column names from 'df1' and 'df2'.
FY_column_list = list(df1.columns[1:]) + list(df2.columns[1:])
# Create an empty list to store the data for the report.
report_data = []
# Iterate through 'correlated_requirements' again and convert the requirement details for each pair into lists.
# Append these lists to the report_data list, excluding the first column (which contains the index).
for req1_idx, req2_idx in correlated_requirements:
    req1_details = df1.iloc[req1_idx].tolist()
    req2_details = df2.iloc[req2_idx].tolist()
    report_data.append(["Correlated"] + req1_details[1:] + req2_details[1:])
# Iterate through 'uncorrelated_rows' and convert the requirement details for each 'df2' row into a list.
for row_idx in uncorrelated_rows:
    req_details = df2.iloc[row_idx].tolist()
    # Create a list with empty values for the columns from 'df1'.
    empty_values = [""] * len(df1.columns[1:])
    # Combine the "Uncorrelated" flag with the empty 'df1' values and the uncorrelated row details from 'df2'.
    report_data.append(["Uncorrelated"] + empty_values + req_details[1:])
# Export the report to an Excel file
FY_column_list_with_flag = ["Correlation"] + FY_column_list
report_df = pd.DataFrame(report_data, columns=FY_column_list_with_flag)
report_df.to_excel("path to file", index=False)
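# In the resulting workbook, the "Correlation" flag column comes first, followed by the 'df1' columns and then the 'df2' columns;
# uncorrelated rows from 'df2' have blank values in the 'df1' columns.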