"""Script to evaluate document-level QE as in the WMT19 shared task."""

import argparse
from collections import defaultdict
import numpy as np


class OverlappingSpans(ValueError):
    """Raised when an annotation contains spans that overlap each other."""


class Span(object):
    def __init__(self, segment, start, end):
        """A contiguous span of text in a particular segment.

        Args:
            segment: ID of the segment (0-based).
            start: Character-based start position.
            end: The position right after the last character.
        """
        self.segment = segment
        self.start = start
        self.end = end
        assert self.end >= self.start

    def __len__(self):
        """Returns the length of the span in characters.

        Note: by convention, an empty span has length 1."""
        return max(1, self.end - self.start)

    def count_overlap(self, span):
        """Given another span, returns the number of matched characters;
        zero if the two spans are in different segments.

        Args:
            span: another Span object.
        Returns:
            The number of matched characters.
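
        Example (illustrative of the conventions above):
            >>> Span(0, 5, 10).count_overlap(Span(0, 8, 12))
            2
            >>> Span(0, 7, 7).count_overlap(Span(0, 5, 10))
            1
            >>> Span(0, 5, 10).count_overlap(Span(1, 5, 10))
            0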
        """
        if self.segment != span.segment:
            return 0
        start = max(self.start, span.start)
        end = min(self.end, span.end)
        if end >= start:
            if span.start == span.end or self.start == self.end:
                assert start == end
                return 1  # By convention, the overlap with empty spans is 1.
            else:
                return end - start
        else:
            return 0


class Annotation(object):
    def __init__(self, severity=None, spans=None):
        """An annotation, which has a severity level (minor, major, or
        critical) and consists of one or more non-overlapping spans.

        Args:
            severity: 'minor', 'major', or 'critical'.
            spans: A list of Span objects.
        """
        # Make sure that the spans do not overlap.
        spans = sorted(spans, key=lambda span: (span.segment, span.start))
        segment = -1
        for span in spans:
            if span.segment != segment:
                # First span in this segment.
                segment = span.segment
                last_end = span.end
            else:
                # Second or later span in the same segment.
                if span.start < last_end:
                    raise OverlappingSpans()
                last_end = span.end

        self.severity = severity
        self.spans = spans

    def __len__(self):
        """Returns the sum of the span lengths (in characters)."""
        return sum(len(span) for span in self.spans)

    def count_overlap(self, annotation, severity_match=None):
        """Given another annotation, returns the number of matched
        characters. If the severities differ, the match is penalized
        according to a severity match matrix.

        Args:
            annotation: another Annotation object.
            severity_match: a dictionary of dictionaries containing match
                penalties for severity pairs. If None, zero credit is given
                unless the severities match exactly.
        Returns:
            The number of matched characters, possibly penalized by a
            severity mismatch.
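
        Example (illustrative; the inline matrix is a toy stand-in for the
        Evaluator's severity_match):
            >>> a = Annotation('major', [Span(0, 0, 10)])
            >>> b = Annotation('critical', [Span(0, 5, 15)])
            >>> a.count_overlap(b)
            0
            >>> a.count_overlap(b, {'major': {'critical': 0.5}})
            2.5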
        """
        # TODO: Maybe normalize by annotation length (e.g. intersection over
        # union)?
        # Note: since we're summing the matches, this won't work as expected
        # if there are overlapping spans (which we assume there aren't).
        matched = 0
        for span in self.spans:
            for annotation_span in annotation.spans:
                matched += span.count_overlap(annotation_span)
        # Scale the overlap by a coefficient that accounts for mispredicted
        # severities. For example, predicting "major" when the error is
        # "critical" gives some partial credit. Without a match matrix, give
        # zero credit unless the severity is correct.
        if severity_match:
            matched *= severity_match[self.severity][annotation.severity]
        else:
            matched *= (self.severity == annotation.severity)
        return matched

    @classmethod
    def from_fields(cls, fields):
        """Creates an Annotation object by loading from a list of string
        fields.

        Args:
            fields: a list of strings containing annotation information:
                - segment_id
                - annotation_start
                - annotation_length
                - severity

                The first three fields may contain several integers separated
                by whitespace, in case there are multiple spans. Any further
                fields (such as notes) are ignored.
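
        Example (illustrative; note that __init__ sorts the spans by
        position):
            >>> ann = Annotation.from_fields(['13 13', '229 214', '7 4', 'minor'])
            >>> ann.severity
            'minor'
            >>> [(s.segment, s.start, s.end) for s in ann.spans]
            [(13, 214, 218), (13, 229, 236)]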
        """
        segments = list(map(int, fields[0].split(' ')))
        starts = list(map(int, fields[1].split(' ')))
        lengths = list(map(int, fields[2].split(' ')))
        assert len(segments) == len(starts) == len(lengths)
        severity = fields[3]
        spans = [Span(segment, start, start + length)
                 for segment, start, length in zip(segments, starts, lengths)]
        return cls(severity, spans)

    @classmethod
    def from_string(cls, line):
        """Creates an Annotation object by loading from a string.

        Args:
            line: tab-separated line containing the annotation information.
                The fields are:
                - document_id
                - segment_id
                - annotation_start
                - annotation_length
                - severity

                Segment id, annotation start and length may contain several
                integers separated by whitespace, in case there are multiple
                spans.
                Example: "A0034<TAB>13 13<TAB>229 214<TAB>7 4<TAB>minor"
        """
        fields = line.split('\t')
        assert len(fields) == 5
        # Drop the document id; from_fields handles the remaining fields.
        return cls.from_fields(fields[1:])

    def to_string(self):
        """Returns a string representation of this annotation.

        This is the representation expected in the output file, without
        notes."""
        segments = []
        starts = []
        lengths = []
        for span in self.spans:
            segments.append(str(span.segment))
            starts.append(str(span.start))
            lengths.append(str(span.end - span.start))

        segment_string = ' '.join(segments)
        start_string = ' '.join(starts)
        length_string = ' '.join(lengths)
        return '\t'.join([segment_string, start_string, length_string,
                          self.severity])


class Evaluator(object):
    def __init__(self):
        """A document-level QE evaluator."""
        # The severity match matrix gives some credit when the severity is
        # slightly mispredicted ("minor" <-> "major" and "major" <->
        # "critical"), but none for extreme mispredictions
        # ("minor" <-> "critical").
        self.severity_match = {'minor': {'minor': 1.0,
                                         'major': 0.5,
                                         'critical': 0.0},
                               'major': {'minor': 0.5,
                                         'major': 1.0,
                                         'critical': 0.5},
                               'critical': {'minor': 0.0,
                                            'major': 0.5,
                                            'critical': 1.0}}

    def run(self, system, reference, verbose=False):
        """Given system and reference documents, computes the macro-averaged
        F1 across all documents.

        Args:
            system: a dictionary mapping names (doc id's) to lists of
                Annotations produced by a QE system.
            reference: a dictionary mapping names (doc id's) to lists of
                reference Annotations.
            verbose: if True, prints the F1 score of each document.
        Returns:
            The macro-averaged F1 score.
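
        Example (illustrative; the real caller passes defaultdicts, but
        plain dicts work when every doc id appears in both):
            >>> system = {'doc1': [Annotation('minor', [Span(0, 0, 10)])]}
            >>> reference = {'doc1': [Annotation('minor', [Span(0, 5, 10)])]}
            >>> print(round(Evaluator().run(system, reference), 3))
            0.667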
        """
        total_f1 = 0.
        for doc_id in system:
            # Both dicts are defaultdicts, returning an empty list if there
            # are no annotations for that doc_id.
            reference_annotations = reference[doc_id]
            system_annotations = system[doc_id]
            f1 = self._compare_document(system_annotations,
                                        reference_annotations)
            if verbose:
                print(doc_id)
                print(f1)
            total_f1 += f1
        total_f1 /= len(system)
        return total_f1

    def _compare_document(self, system, reference):
        """Computes the F1 score for a single document, given a system output
        and a reference. This is done by computing a precision according to
        the best possible matching of annotations from the system's
        perspective, and a recall according to the best possible matching of
        annotations from the reference's perspective. Gives some partial
        credit to annotations that match with the wrong severity.

        Args:
            system: a list of Annotations produced by the system for this
                document.
            reference: a list of reference Annotations for this document.
        Returns:
            The F1 score of a single document.
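
        Example (illustrative of the empty-document conventions below):
            >>> e = Evaluator()
            >>> e._compare_document([], [])
            1.0
            >>> e._compare_document([Annotation('minor', [Span(0, 0, 5)])], [])
            0.0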
        """
        all_matched = np.zeros((len(system), len(reference)))
        for i, system_annotation in enumerate(system):
            for j, reference_annotation in enumerate(reference):
                matched = reference_annotation.count_overlap(
                    system_annotation,
                    severity_match=self.severity_match)
                all_matched[i, j] = matched

        lengths_sys = np.array([len(annotation) for annotation in system])
        lengths_ref = np.array([len(annotation) for annotation in reference])

        if lengths_sys.sum() == 0:
            # No system annotations: precision is trivially perfect.
            precision = 1.
        elif lengths_ref.sum() == 0:
            # No reference annotations: every system annotation is spurious.
            precision = 0.
        else:
            # Normalize by annotation length.
            precision_by_annotation = all_matched.max(1) / lengths_sys
            precision = precision_by_annotation.mean()

        # Same as above, now for recall.
        if lengths_ref.sum() == 0:
            recall = 1.
        elif lengths_sys.sum() == 0:
            recall = 0.
        else:
            recall_by_annotation = all_matched.max(0) / lengths_ref
            recall = recall_by_annotation.mean()

        if precision + recall == 0.:
            f1 = 0.
        else:
            f1 = 2 * precision * recall / (precision + recall)
        assert 0. <= f1 <= 1.

        return f1


def load_annotations(file_path):
    """Loads a file containing annotations for multiple documents.

    The file should contain lines with the following format:
        <DOCUMENT ID> <SEGMENT IDS> <SPAN START POSITIONS> <SPAN LENGTHS> <SEVERITY>

    Fields are separated by tabs; SEGMENT IDS, SPAN START POSITIONS and SPAN
    LENGTHS can each hold a list of values separated by whitespace.

    Args:
        file_path: path to the file.
    Returns:
        a dictionary mapping document id's to a list of annotations.
    """
    annotations = defaultdict(list)

    with open(file_path, 'r', encoding='utf8') as f:
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            fields = line.split('\t')
            doc_id = fields[0]

            try:
                annotation = Annotation.from_fields(fields[1:])
            except OverlappingSpans:
                # Skip annotations with overlapping spans, but report them.
                print('Overlapping spans when reading line %d of file %s'
                      % (line_number, file_path))
                continue

            annotations[doc_id].append(annotation)

    return annotations


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('system', help='System annotations')
    parser.add_argument('ref', help='Reference annotations')
    parser.add_argument('-v', help='Show score by document',
                        action='store_true', dest='verbose')
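    # Example invocation (the script name below is illustrative):
    #   python evaluate_document_qe.py system.tsv reference.tsv -v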
    args = parser.parse_args()

    system = load_annotations(args.system)
    reference = load_annotations(args.ref)
    evaluator = Evaluator()
    f1 = evaluator.run(system, reference, args.verbose)
    print('Final F1:', f1)


if __name__ == '__main__':
    main()