-
Notifications
You must be signed in to change notification settings - Fork 131
data instance for frame cloze problem #383
base: master
Are you sure you want to change the base?
Changes from 4 commits
67e00b1
90a7086
e420699
90afb77
b23d28d
42bedfc
b2b5562
7a04a52
7bfddaa
566a2bf
cd2a062
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,177 @@ | ||
| from typing import Dict, List | ||
|
|
||
| import numpy | ||
| from overrides import overrides | ||
|
|
||
| from ..instance import TextInstance, IndexedInstance | ||
| from ...data_indexer import DataIndexer | ||
|
|
||
|
|
||
| # Functionality supported in this class. | ||
| # | ||
| # raw input line: | ||
| # "event:plant absorb water###participant:water###agent:plant###finalloc:soil" TAB "agent:plant" | ||
| # | ||
| # formatted to: | ||
| # ["plant", "unk", "unk", "unk", "unk", "plant absorb water", | ||
| # "soil", "unk", "unk", "unk", "unk", "unk", | ||
| # "unk", "unk", "unk", "unk", "unk", "unk", | ||
| # "unk", "unk", "unk", "unk", "unk", "unk", | ||
| # "unk", "unk", "water"] | ||
| # | ||
| # padded to a fixed length of say, 5 words per phrase. | ||
| # ["plant P P P P", | ||
| # "unk P P P P", | ||
| # "unk P P P P", | ||
| # "unk P P P P", | ||
| # "unk P P P P", | ||
| # "plant absorb water P P ", | ||
| # ...] | ||
|
|
||
| # TODO PR request for having these in the json as an application specific content | ||
| # the slotnames can vary according to different end applications, e.g., a HowTo tuple, OpenIE tuple ... | ||
| SLOTNAMES_ORDERED = ["agent", "beneficiary", "causer", "context", "definition", "event", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want to have these global variables, you could consider moving them inside
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want to always have a fixed set of slots that every dataset uses, where your input includes a one-hot vector for which slot you're trying to fill, or something like that, this should work. The model's weights will be tied to particular slots, so a model trained on one slot configuration will not possibly generalize to any other slot configuration. If you want to make this more modular, you could consider using a separate namespace in the data indexer to keep track of this for you, instead of hard-coding it here. Something similar is done with tag labels here.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason to have this configurable was to be able to train any variant of the input with the same code, by passing a list with the config file upfront. This configuration allows ordering the input on the fly for any types of slots. It appears that there is no possibility to specify these list of slotnames in the config file, so, perhaps having this as a constant may be better than passing through a function. The data indexer does not really do any book keeping because the frame_instance takes care of it. Am I thinking incorrectly?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, yeah, your preprocess scripts putting everything into this format will take care of that, I think. Another consideration is that you'll be essentially wasting computation and modeling capacity on all of the un-filled slots by filling them with "unk". You could either: (1) leave them empty, and let them be masked in whatever model computation you do later. This will keep the model from wasting its capacity trying to figure out that "unk" means "unfilled". Or (2) instead of having a fixed-sized input here, you could have a key-value input, where you give a list of (slot, value) pairs as input to the model. In this setting, it's also a lot easier to think about how to pass in the query - it's just another (slot, value) pair, just with a Happy to talk about this more in person if it's hard to follow what I'm suggesting here. |
||
| "finalloc", "headverb", "initloc", "input", "output", "manner", | ||
| "patient", "resultant", "timebegin", "timeend", "temporal", "hierarchical", | ||
| "similar", "contemporary", "enables", "mechanism", "condition", "purpose", | ||
| "cause", "openrel", "participant"] | ||
| UNKNOWN_SLOTVAL = "unk" # making an open world assumption, we do not observe all the values | ||
| QUES_SLOTVAL = "ques" # this slot in the frame must be queried/completed. | ||
| MAX_PHRASE_LEN = 6 | ||
|
|
||
| class FrameInstance(TextInstance): | ||
|
|
||
| """ | ||
| A FrameInstance is a kind of TextInstance that has text in multiple slots. This generalizes a FrameInstance. | ||
| """ | ||
| def __init__(self, | ||
| # the input can have only few slots and not all and in any random order. | ||
|
||
| # e.g., "event:plant absorb water###participant:water###agent:plant###finalloc:soil" | ||
| # TAB "agent:plant" | ||
| text: List[str], | ||
| # output is a phrase | ||
| label: str=None): | ||
| super(FrameInstance, self).__init__(label) | ||
| self.text = text # "event:plant absorb water###participant:water###agent:plant" TAB "agent:plant" | ||
|
|
||
| def __str__(self): | ||
| return 'FrameInstance( [' + ',\n'.join(self.text) + '] , ' + str(self.label) + ')' | ||
|
|
||
| @overrides | ||
| # accumulate words from each slot's phrase | ||
|
||
| def words(self) -> Dict[str, List[str]]: | ||
| words = [] | ||
| for phrase in self.text: # phrases | ||
| phrase_words = self._words_from_text(phrase) | ||
| words.extend(phrase_words['words']) | ||
| label_words = self._words_from_text(self.label) | ||
| words.extend(label_words['words']) | ||
| return {"words": words} | ||
|
|
||
| @staticmethod | ||
| # slot_as_string: "participant:water" => name=participant, val=water | ||
| def slot_from(slot_as_string: str, kv_separator: str=":"): | ||
| slot_name_val = slot_as_string.split(kv_separator) | ||
| return {'name': slot_name_val[0], 'val': slot_name_val[1]} | ||
|
|
||
| @staticmethod | ||
| # frame_as_string: "event:plant absorb water###participant:water" TAB "participant:water" | ||
| def unpack_input(frame_as_string: str, kv_separator: str="\t"): | ||
| # no information is lost in lowercasing, and simplifies matching. | ||
| partialframe_query = frame_as_string.lower().split(kv_separator) | ||
| if len(partialframe_query) != 2: | ||
| raise RuntimeError("Unexpected number (not 2) of fields in frame: " + frame_as_string) | ||
| return {'content': partialframe_query[0], 'query': partialframe_query[1]} | ||
|
|
||
| @staticmethod | ||
| def given_slots_from(slots_csv: str, values_separator: str="###", kv_separator: str=":"): | ||
| return dict(map(lambda x: x.split(kv_separator), slots_csv.split(values_separator))) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a comment describing this function? It's not clear immediately what it does. You could consider re-writing this like: |
||
|
|
||
| # Performs two types of padding: | ||
| # i) unobserved slots are filled with self.unknown_slotval | ||
| # ii) query slot is masked with self.unknown_queryval | ||
| @staticmethod | ||
| def padded_slots_from(sparse_frame: Dict[str, str], query_slotname: str): | ||
| slots = [] | ||
| for slotname in SLOTNAMES_ORDERED: | ||
| if slotname == query_slotname: # query hence masked | ||
| slots.append(QUES_SLOTVAL) | ||
| elif slotname in sparse_frame: # observed hence as-is | ||
| slots.append(sparse_frame[slotname]) | ||
| else: # unobserved hence padded | ||
| slots.append(UNKNOWN_SLOTVAL) | ||
| return slots | ||
|
|
||
| @classmethod | ||
| @overrides | ||
| def read_from_line(cls, line: str): | ||
| """ | ||
| Reads a FrameInstance from a line. The format is: | ||
| frame represented as list of <role:role value phrase of maxlen 5> TAB <label> | ||
| e.g., from | ||
| event:plant absorb water###participant:water###agent:plant###finalloc:soil | ||
| to | ||
| ["plant", "unk", "unk", "unk", "unk", "plant absorb water", | ||
| "soil", "unk", "unk", "unk", "unk", "unk", | ||
| "unk", "unk", "unk", "unk", "unk", "unk", | ||
| "unk", "unk", "unk", "unk", "unk", "unk", | ||
| "unk", "unk", "water"] | ||
| Provides ordering and sparseness flexibility to the input text. | ||
| """ | ||
| # Extract the query slot name and expected value | ||
| # e.g. from, participant:water, extract the expected slot value "water" | ||
| unpacked_input = cls.unpack_input(line) | ||
| query_slot = cls.slot_from(unpacked_input['query']) | ||
| given_slots = cls.given_slots_from(unpacked_input['content']) | ||
| padded_slots = cls.padded_slots_from(given_slots, query_slot['name']) | ||
| return cls(padded_slots, label=query_slot['val']) | ||
|
|
||
| @overrides | ||
| def to_indexed_instance(self, data_indexer: DataIndexer): | ||
| # a phrase in a slot, is converted from [list of words] to [list of wordids] | ||
|
||
| # this is repeated for every slot, hence a list of list of wordids/integers | ||
| indices_slotvals = [self._index_text(phrase, data_indexer) for phrase in self.text] | ||
| # the label is a phrase, and is converted from [list of words] to [list of wordids] | ||
| indices_label = self._index_text(self.label, data_indexer) | ||
| return IndexedFrameInstance(indices_slotvals, indices_label) | ||
|
|
||
|
|
||
| class IndexedFrameInstance(IndexedInstance): | ||
| # One list of ints make up a slotvalue because a slotvalue is a phrase, | ||
| # and so every word of the phrase is identified with an int id. | ||
| # max length of a phrase is 6, if it is lesser, then pad. | ||
| def __init__(self, word_indices: List[List[int]], label): | ||
| super(IndexedFrameInstance, self).__init__(label) | ||
| self.word_indices = word_indices | ||
|
|
||
| @classmethod | ||
| @overrides | ||
| def empty_instance(cls): | ||
| return IndexedFrameInstance([], label=None) | ||
|
|
||
| @overrides | ||
| def get_padding_lengths(self) -> Dict[str, int]: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Take a look at how Then, you can see here that the Implementing these two methods like this will allow you to remove the |
||
| # the parent class somehow understands the semantics of these dictionary keys. | ||
| # would be better if these keys were more explicit. | ||
| return {'num_sentence_words': MAX_PHRASE_LEN} | ||
|
||
|
|
||
| @overrides | ||
| def pad(self, padding_lengths: Dict[str, int]): | ||
| """ | ||
| Pads (or truncates) all slot values to the maxlen | ||
| e.g, | ||
| input (phrases corresponding to each, the number of slots is fixed.) | ||
| ["1000", "-1", "-1", "-1", "-1", "1 2 3",..] | ||
| output (padded phrases, as phrases are composed of variable number of words) | ||
| ["1000 0 0 0 0", "-1 0 0 0 0", "-1 0 0 0 0", "-1 0 0 0 0", "-1 0 0 0 0", "1 2 3 0 0",..] | ||
| """ | ||
| truncate_from_right = False | ||
| self.word_indices = [self.pad_word_sequence(indices, padding_lengths, truncate_from_right) | ||
| for indices in self.word_indices] | ||
| self.label = self.pad_word_sequence(self.label, padding_lengths, truncate_from_right) | ||
|
|
||
| @overrides | ||
| def as_training_data(self): | ||
| frame_matrix = numpy.asarray(self.word_indices, dtype='int32') | ||
| label_list = numpy.asarray(self.label, dtype='int32') | ||
| return frame_matrix, label_list | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| # pylint: disable=no-self-use,invalid-name | ||
|
|
||
| from deep_qa.data.instances.text_classification.frame_instance import FrameInstance, IndexedFrameInstance | ||
|
|
||
| # Example of a typical input | ||
| line = "event:plant absorb water###participant:water###agent:plant###finalloc:soil"+"\t"+"finalloc:soil" | ||
|
||
|
|
||
|
|
||
| class TestFrameInstance: | ||
|
||
|
|
||
| # TODO more test with boundary conditions on imperfect input. | ||
| def test_slots_unwrap_correctly(self): | ||
| instance = FrameInstance.read_from_line(line) | ||
| # what we get | ||
| machine_label = "" + instance.label | ||
|
||
| machine_slotvals = ""+instance.text.__str__() | ||
| # what we expect | ||
| expected_label = "soil" | ||
| expected_slotsvals = "['plant', 'unk', 'unk', 'unk', 'unk', " \ | ||
| "'plant absorb water', 'ques', 'unk', 'unk', " \ | ||
| "'unk', 'unk', 'unk', 'unk', 'unk', 'unk', " \ | ||
| "'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', " \ | ||
| "'unk', 'unk', 'unk', 'unk', 'water']" | ||
| # do they match? | ||
| assert machine_label == expected_label | ||
| assert machine_slotvals == expected_slotsvals | ||
|
|
||
| def test_words_from_frame_aggregated_correctly(self): | ||
| instance = FrameInstance.read_from_line(line) | ||
| assert instance.words()['words'].__len__() == 30 | ||
|
||
|
|
||
|
|
||
| class TestIndexedFrameInstance: | ||
|
|
||
| def test_words_from_frame_aggregated_correctly(self): | ||
| # TODO open a PR request to provide fixed length padding e.g. 6 | ||
| indexed_instance = IndexedFrameInstance([[1000], [1, 2, 3, 4, 5, 6, 7, 8], | ||
| [1, 2, 3]], [1, 2, 3]) | ||
| # unpadded label should be read correctly. | ||
| assert indexed_instance.label.__str__() == "[1, 2, 3]" | ||
| padding_lengths = indexed_instance.get_padding_lengths() | ||
| assert padding_lengths.__str__() == "{'num_sentence_words': 6}" | ||
|
||
| indexed_instance.pad(padding_lengths) | ||
| assert indexed_instance.label.__str__() == "[1, 2, 3, 0, 0, 0]" | ||
| assert indexed_instance.word_indices.__str__() == "[[1000, 0, 0, 0, 0, 0], " \ | ||
| "[1, 2, 3, 4, 5, 6], " \ | ||
| "[1, 2, 3, 0, 0, 0]]" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove these commented out sections, as it's duplicated below in
read_from_line.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.