peps/scan-ops.py at master · fedora-python/peps · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
# http://legacy.python.org/dev/peps/pep-0465/
# https://gist.github.com/njsmith/9157645

# usage:
#   python3 scan-ops.py stdlib_path sklearn_path nipy_path

import sys
import os
import os.path
import tokenize
from collections import OrderedDict

NON_SOURCE_TOKENS = [
    tokenize.COMMENT, tokenize.NL, tokenize.ENCODING, tokenize.NEWLINE,
    tokenize.INDENT, tokenize.DEDENT,
    ]

SKIP_OPS = list("(),.:[]{}@;") + ["->", "..."]

class TokenCounts(object):
    def __init__(self, dot_names=[]):
        self.counts = {}
        self.sloc = 0
        self.dot_names = dot_names

    def count(self, path):
        sloc_idxes = set()
        for token in tokenize.tokenize(open(path, "rb").readline):
            if token.type == tokenize.OP:
                self.counts.setdefault(token.string, 0)
                self.counts[token.string] += 1
            if token.string in self.dot_names:
                self.counts.setdefault("dot", 0)
                self.counts["dot"] += 1
            if token.type not in NON_SOURCE_TOKENS:
                sloc_idxes.add(token.start[0])
        self.sloc += len(sloc_idxes)

    @classmethod
    def combine(cls, objs):
        combined = cls()
        for obj in objs:
            for op, count in obj.counts.items():
                combined.counts.setdefault(op, 0)
                combined.counts[op] += count
            combined.sloc += obj.sloc
        return combined

def count_tree(root, **kwargs):
    c = TokenCounts(**kwargs)
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(".py"):
                path = os.path.join(dirpath, filename)
                try:
                    c.count(path)
                    sys.stderr.write(".")
                    sys.stderr.flush()
                except Exception as e:
                    sys.stderr.write("\nFailed to read %s: %s\n" % (path, e))
    return c

# count_objs is OrderedDict (name -> TokenCounts)
def summarize(count_objs, out):
    ops = {}
    for count_obj in count_objs.values():
        for op in count_obj.counts:
            ops[op] = []
    for count_obj in count_objs.values():
        for op, row in ops.items():
            count = count_obj.counts.get(op, 0)
            row.append(count / count_obj.sloc)
    titles = ["Op"] + list(count_objs)
    # 4 chars is enough for ops and all numbers.
    column_widths = [max(len(title), 4) for title in titles]

    rows = []
    for op, row in ops.items():
        #rows.append(["``" + op + "``"] + row)
        rows.append([op] + row)

    rows.sort(key=lambda row: row[-1])
    rows.reverse()

    def write_row(entries):
        out.write("  ".join(entries))
        out.write("\n")

    def lines():
        write_row("=" * w for w in column_widths)

    lines()
    write_row(t.rjust(w) for w, t in zip(column_widths, titles))
    lines()
    for row in rows:
        op = row[0]
        if op in SKIP_OPS:
            continue
        # numbers here are avg number of uses per sloc, which is
        # inconveniently small. convert to uses/1e4 sloc
        numbers = row[1:]
        number_strs = [str(int(round(x * 10000))) for x in numbers]
        formatted_row = [op] + number_strs
        write_row(str(e).rjust(w)
                  for w, e in zip(column_widths, formatted_row))
    lines()

def run_projects(names, dot_names, dirs, out):
    assert len(names) == len(dot_names) == len(dirs)
    count_objs = OrderedDict()
    for name, dot_name, dir in zip(names, dot_names, dirs):
        counts = count_tree(dir, dot_names=dot_name)
        count_objs[name] = counts
        out.write("%s: %s sloc\n" % (name, counts.sloc))
    count_objs["combined"] = TokenCounts.combine(count_objs.values())
    summarize(count_objs, out)

if __name__ == "__main__":
    run_projects(["stdlib", "scikit-learn", "nipy"],
                 [[],
                  # https://github.com/numpy/numpy/pull/4351#discussion_r9977913
                  # sklearn fast_dot is used to fix up some optimizations that
                  # are missing from older numpy's, but in modern days is
                  # exactly the same, so it's fair to count. safe_sparse_dot
                  # has hacks to workaround some quirks in scipy.sparse
                  # matrices, but these quirks are also already fixed, so
                  # counting this calls is also fair.
                  ["dot", "fast_dot", "safe_sparse_dot"],
                  ["dot"]],
                 sys.argv[1:],
                 sys.stdout)