"""Benchmark ``string_grouper.match_strings`` on random samples of company names.

For every (x, y) pair on the size grid ``X`` x ``Y`` the script draws x
"left" and y "right" company names at random from the 'Company Name' column
of ``data/sec__edgar_company_info.csv``, times one ``match_strings`` call
(in minutes), and keeps a running mean of the timings over ``ns``
repetitions.  The mean grid, ``X`` and ``Y`` are written, in that order, to
one ``.npy`` file.
"""
import os
import random
import time

import numpy as np
import pandas as pd
from string_grouper import match_strings

# Optional: limit this process's memory through an external ``procgov``
# executable before the benchmark starts.
# mem_limit = '1G'
# procgov = r'C:\Users\heamu\Source\Repos\process-governor\ProcessGovernor\bin\x64\Debug\procgov.exe'
# os.popen(f'{procgov} -r -m {mem_limit} -p {os.getpid()}')
# time.sleep(1)

progress = 0        # fraction of all timed match_strings calls completed so far
do_print = True     # echo timings to stdout as a nested-list literal
companies = pd.read_csv('data/sec__edgar_company_info.csv')

# Left-hand sample sizes run from x0 to Nx (inclusive) in steps of dNx.
x0 = 10000
Nx = 10000
dNx = 1000
# Nx2 / dNx2 are only used by the alternative (commented-out) X grid below.
Nx2 = 500000
dNx2 = 50000
# Right-hand sample sizes run from y0 to Ny (inclusive) in steps of dNy.
y0 = 10000
Ny = 10000
dNy = 10000
ns = 10             # number of repetitions averaged together

# X = np.append(np.arange(dNx, Nx + 1, dNx), np.arange(dNx2 + dNx2, Nx2 + 1, dNx2))
X = np.arange(x0, Nx + 1, dNx)
Y = np.arange(y0, Ny + 1, dNy)

# Float accumulator for the running mean of timings, indexed [x index, y index].
means = np.zeros((len(X), len(Y)))


def _emit(text, end=''):
    """Print *text* immediately when ``do_print`` is on; otherwise do nothing."""
    if do_print:
        print(text, flush=True, end=end)


for s in range(ns):
    dgrid = []      # timings of this repetition: one row per x, one entry per y
    _emit('[')
    for i, x in enumerate(X):
        left_df = companies['Company Name'].iloc[random.sample(range(len(companies)), k=x)]
        if i > 0:
            _emit(', ', end='\n')   # rows of the printed grid go on separate lines
        dseries = []
        _emit('[')
        for j, y in enumerate(Y):
            if j > 0:
                _emit(', ')
            right_df = companies['Company Name'].iloc[random.sample(range(len(companies)), k=y)]
            # perf_counter is monotonic and high-resolution, so the measured
            # interval cannot be distorted by system clock adjustments.
            t0 = time.perf_counter()
            match_strings(right_df, left_df, n_blocks=(1, 1))
            elapsed_min = (time.perf_counter() - t0) / 60
            dseries.append(elapsed_min)
            progress += 1.0 / (ns * len(X) * len(Y))
            # print(f'Progress {progress:.1%}', end='\x1b[1K\r')
            _emit(f'{elapsed_min}')
        _emit(']')
        dgrid.append(dseries)
    _emit(']', end='\n')

    # Incremental mean: after repetition s, means is the average of s + 1 grids.
    means = (np.asarray(dgrid) + s * means) / (s + 1)

    # Checkpoint after every repetition; 'wb' overwrites the previous file, so
    # the file always holds the mean of all repetitions completed so far.
    with open(f'runtime_means_x_{x0}-{Nx}_y_{y0}-{Ny}.npy', 'wb') as f:
        np.save(f, means)
        np.save(f, X)
        np.save(f, Y)
# send_me_mail()
0 commit comments