# ML/preprocess.py (DementiAnalytics/ML repository)

import os
import nltk
import spacy
import syllapy

# Check that each required NLTK resource can be found; download the ones that cannot
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

# Load the spaCy English model; if loading raises OSError, download it and load again
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')


def process_and_rename(data_dir):
    """
    Preprocesses files to include 'healthy' and 'dementia' markers in the filenames for further labeling in the model

    Every ``.txt`` file in ``<data_dir>/control`` is renamed to ``healthy_<n>.txt`` and
    every ``.txt`` file in ``<data_dir>/dementia`` to ``dementia_<n>.txt``. Files that
    already follow that naming scheme are left untouched, and a new name is never
    reused while a file with that name exists, so running the function again (or after
    adding new files) does not overwrite existing transcripts.

    Args:
        data_dir: Directory containing the ``control`` and ``dementia`` sub-folders.

    Raises:
        OSError: If a sub-folder cannot be listed or a file cannot be renamed.
    """
    for label_folder, label_prefix in (("control", "healthy"), ("dementia", "dementia")):
        folder_path = os.path.join(data_dir, label_folder)
        next_idx = 0

        # Sorted snapshot: numbering is deterministic and the listing is not
        # affected by the renames performed inside the loop.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".txt"):
                continue

            # Skip files that already carry the "<prefix>_<n>.txt" marker.
            stem_prefix, _, stem_index = filename[:-len(".txt")].rpartition("_")
            if stem_prefix == label_prefix and stem_index.isdigit():
                continue

            # os.rename may silently replace an existing target, so advance to
            # the first index whose filename is still free.
            new_path = os.path.join(folder_path, f"{label_prefix}_{next_idx}.txt")
            while os.path.lexists(new_path):
                next_idx += 1
                new_path = os.path.join(folder_path, f"{label_prefix}_{next_idx}.txt")

            os.rename(os.path.join(folder_path, filename), new_path)
            next_idx += 1

def extract_features(text):
    """
    Extracts several linguistic features based on the provided text

    Word-level figures (word count, type/token ratio, syllables, the rate
    denominators) are based on whitespace-separated tokens of ``text``; sentence
    and part-of-speech figures come from the spaCy pipeline ``nlp``.

    Filler words are matched case-insensitively on whitespace tokens with
    surrounding punctuation removed, and multi-word fillers such as "you know"
    are matched as consecutive tokens.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict with the keys ``word_count``, ``sentence_count``,
        ``avg_sentence_length``, ``type_token_ratio``, ``noun_rate``,
        ``pronoun_rate``, ``adjective_rate``, ``verb_rate``, ``filler_count``
        and ``syllable_count``. Ratios are 0 when their denominator is 0.
    """
    import string
    from collections import Counter

    doc = nlp(text)
    sentences = list(doc.sents)
    words = text.split()
    unique_words = set(words)

    word_count = len(words)
    sentence_count = len(sentences)

    def per_word(count):
        # Guard against empty input so the rates never divide by zero.
        return count / word_count if word_count > 0 else 0

    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    type_token_ratio = per_word(len(unique_words))

    # One pass over the document; a Counter yields 0 for tags that never occur.
    pos_counts = Counter(token.pos_ for token in doc)

    filler_words = ['uh', 'um', 'like', 'you know']
    # Normalise once: lower-case and strip punctuation so "Um," still counts.
    stripped = (raw.strip(string.punctuation) for raw in text.lower().split())
    tokens = [tok for tok in stripped if tok]

    filler_count = 0
    for filler in filler_words:
        # Compare as token sequences so multi-word fillers can match.
        phrase = filler.split()
        width = len(phrase)
        filler_count += sum(
            1
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == phrase
        )

    syllable_count = sum(syllapy.count(word) for word in words)

    return {
        "word_count": word_count,
        "sentence_count": sentence_count,
        "avg_sentence_length": avg_sentence_length,
        "type_token_ratio": type_token_ratio,
        "noun_rate": per_word(pos_counts["NOUN"]),
        "pronoun_rate": per_word(pos_counts["PRON"]),
        "adjective_rate": per_word(pos_counts["ADJ"]),
        "verb_rate": per_word(pos_counts["VERB"]),
        "filler_count": filler_count,
        "syllable_count": syllable_count
    }

def create_labels(data_dir):
    """
    Creates data labels and feature list for visualization and model input

    Reads every ``.txt`` file in ``<data_dir>/control`` and ``<data_dir>/dementia``
    (in sorted filename order), extracts its features and assigns a label:
    0 when the filename starts with "healthy", 1 when it starts with "dementia"
    (case-insensitive). A file with neither prefix takes the label of the folder
    it is in (control -> 0, dementia -> 1), so the two returned lists always
    have the same length and stay aligned index by index.

    Args:
        data_dir: Directory containing the ``control`` and ``dementia`` sub-folders.

    Returns:
        A tuple ``(features_list, labels)`` where ``features_list[i]`` is the
        feature dict of the i-th file and ``labels[i]`` is its 0/1 label.
    """
    features_list = []
    labels = []

    for label_folder, folder_label in (("control", 0), ("dementia", 1)):
        folder_path = os.path.join(data_dir, label_folder)
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(folder_path, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()

            lowered = filename.lower()
            if lowered.startswith("healthy"):
                label = 0
            elif lowered.startswith("dementia"):
                label = 1
            else:
                # Unrenamed file: fall back to the folder so no feature row
                # is ever left without a label.
                label = folder_label

            # Append both together so the lists cannot drift out of step.
            features_list.append(extract_features(text))
            labels.append(label)

    return features_list, labels