-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathswda_functions.py
100 lines (80 loc) · 3.69 KB
/
swda_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
"""
Functions for using swda.py to explore the Switchboard Dialog Act Corpus.
"""
__author__ = "Christopher Potts"
__version__ = "2.0"
__license__ = "GNU general public license, version 2"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"
######################################################################
import csv
from collections import defaultdict
from operator import itemgetter
from swda import CorpusReader
######################################################################
def swda_education_region():
    """Print how often each (education, dialect area) caller profile occurs.

    Every transcript yielded by the corpus reader for 'swda' contributes
    two profiles: one built from the from-caller attributes and one from
    the to-caller attributes.  Each distinct profile is printed on its
    own line followed by its count, most frequent profile first.
    """
    profile_counts = defaultdict(int)
    reader = CorpusReader('swda')
    # display_progress=True tracks progress while iterating the transcripts.
    for transcript in reader.iter_transcripts(display_progress=True):
        caller_profiles = (
            (transcript.from_caller_education,
             transcript.from_caller_dialect_area),
            (transcript.to_caller_education,
             transcript.to_caller_dialect_area),
        )
        for profile in caller_profiles:
            profile_counts[profile] += 1
    # Rank the (profile, count) pairs by their count (index 1), largest
    # first, then print them.
    ranked = sorted(profile_counts.items(), key=itemgetter(1), reverse=True)
    for profile, count in ranked:
        print("{} {}".format(profile, count))
######################################################################
def tag_counts():
    """Tally utt.act_tag over all utterances and print the tallies.

    One "tag count" line is printed per distinct tag, ordered from the
    most frequent tag to the least frequent.
    """
    tallies = defaultdict(int)
    reader = CorpusReader('swda')
    for utterance in reader.iter_utterances(display_progress=True):
        tallies[utterance.act_tag] += 1
    # Sort the (tag, count) pairs in place by count, largest first.
    ranked = list(tallies.items())
    ranked.sort(key=itemgetter(1), reverse=True)
    for tag, count in ranked:
        print("{} {}".format(tag, count))
######################################################################
def count_matches():
    """Report how many single-tree utterances have a perfectly matching tree.

    Only utterances with exactly one associated tree are considered.
    Each of them counts as a match when utt.tree_is_perfect_match() is
    true and as a mismatch otherwise.  Prints the number of matches and
    the share of matches among the considered utterances, expressed as
    a percentage (0-100).
    """
    matches = 0
    considered = 0
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        # Utterances with zero or several trees are left out entirely.
        if len(utt.trees) != 1:
            continue
        considered += 1
        if utt.tree_is_perfect_match():
            matches += 1
    # Guard the division: when no utterance has exactly one tree the
    # share is reported as 0.0 rather than raising ZeroDivisionError.
    # The ratio is scaled by 100 so the number agrees with the
    # "percent" label in the output.
    percent = 100.0 * matches / considered if considered else 0.0
    print("match: {} ({} percent)".format(matches, percent))
######################################################################
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in
    which each utterance utt has its own row consisting of just
    utt.act_tag, utt.damsl_act_tag(), and utt.trees[0].label(),
    restricting attention to utterances for which
    utt.tree_is_perfect_match() is true.

    The first row of the file is the header
    ActTag, DamslActTag, RootNode.
    """
    # The with-block closes (and so flushes) the file even if iteration
    # fails part-way through.  newline='' is what the csv module asks
    # for, so the writer's own row terminators are not translated.
    with open('swda-actags-and-rootlabels.csv', 'wt', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
        corpus = CorpusReader('swda')
        for utt in corpus.iter_utterances(display_progress=True):
            if utt.tree_is_perfect_match():
                csvwriter.writerow(
                    [utt.act_tag, utt.damsl_act_tag(), utt.trees[0].label()])
######################################################################
def act_tags_and_text():
    """
    Create a CSV file named swda-acttags-and-text.csv in
    which each utterance utt has its own row consisting of
    utt.damsl_act_tag() and the utterance's words, obtained from
    utt.text_words(filter_disfluency=True) and joined with single
    spaces.  The first row of the file is the header DamslActTag, Text.

    This data can be used for training a speech-act classifier.
    """
    # The with-block closes (and so flushes) the file even if iteration
    # fails part-way through.  newline='' is what the csv module asks
    # for, so the writer's own row terminators are not translated.
    with open('swda-acttags-and-text.csv', 'wt', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['DamslActTag', 'Text'])
        corpus = CorpusReader('swda')
        for utt in corpus.iter_utterances(display_progress=True):
            clean_words = utt.text_words(filter_disfluency=True)
            csvwriter.writerow([utt.damsl_act_tag(), " ".join(clean_words)])