diff --git a/get_metrics.py b/get_metrics.py
new file mode 100644
index 0000000..0339a4a
--- /dev/null
+++ b/get_metrics.py
@@ -0,0 +1,374 @@
+import matplotlib
+# Select a GUI backend before pyplot is imported. 'TkAgg' is the canonical
+# spelling, and the `warn` keyword was removed in recent matplotlib releases.
+matplotlib.use('TkAgg', force=True)
+
+import sys
+import os
+from collections import Counter, OrderedDict
+from operator import itemgetter
+
+import numpy as np
+import matplotlib.pyplot as plt
+from astropy.table import Table
+import schwimmbad
+from cesium.time_series import TimeSeries
+import cesium.featurize as featurize
+from tqdm import tqdm_notebook
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.decomposition import PCA
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import confusion_matrix
+import seaborn as sns
+import pandas as pd
+
+
+def regular_log_loss(y_true, y_pred, relative_class_weights=None):
+    """
+    Unweighted (uniform class weight) version of the log loss used for the
+    Kaggle challenge. `relative_class_weights` is accepted for interface
+    compatibility but ignored.
+    """
+    predictions = y_pred.copy()
+    # sanitize predictions: clipping at the machine epsilon prevents log(0)
+    epsilon = sys.float_info.epsilon
+    predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
+    predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
+    predictions = np.log(predictions)
+    # indexing on y_true == 1 acts as a truth mask,
+    # since y_true contains only zeros and ones
+    class_logloss = []
+    uniform_class_weights = np.ones(predictions.shape[1])
+    for i in range(predictions.shape[1]):
+        # column-wise average log loss with the truth mask applied
+        result = np.average(predictions[:, i][y_true[:, i] == 1])
+        class_logloss.append(result)
+    return -1 * np.average(class_logloss, weights=uniform_class_weights)
+
+
+def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):
+    """
+    Implementation of the weighted log loss used for the Kaggle challenge.
+    """
+    predictions = y_pred.copy()
+    # sanitize predictions: clipping at the machine epsilon prevents log(0)
+    epsilon = sys.float_info.epsilon
+    predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
+    predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
+    predictions = np.log(predictions)
+    # indexing on y_true == 1 acts as a truth mask,
+    # since y_true contains only zeros and ones
+    class_logloss = []
+    for i in range(predictions.shape[1]):
+        # column-wise average log loss with the truth mask applied
+        result = np.average(predictions[:, i][y_true[:, i] == 1])
+        class_logloss.append(result)
+    return -1 * np.average(class_logloss, weights=relative_class_weights)
+
+
+def multiclass_brier(y_true, y_pred, relative_class_weights=None):
+    """
+    Implementation of the multiclass Brier score for the Kaggle challenge.
+    """
+    predictions = y_pred.copy()
+    q_each = (predictions - y_true) ** 2
+    class_brier = []
+    for i in range(predictions.shape[1]):
+        result = np.average(q_each[:, i][y_true[:, i] == 1])
+        class_brier.append(result)
+    return np.average(class_brier, weights=relative_class_weights)
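+
+
+# A quick sanity check of the metrics above -- an illustrative sketch on
+# made-up toy inputs (not PLAsTiCC data): perfect one-hot predictions score
+# a log loss near zero, and uniform predictions score ln(n_classes)
+# regardless of the class weights. Call _sanity_check_metrics() to verify.
+def _sanity_check_metrics(n_classes=3):
+    toy_truth = np.eye(n_classes)  # each toy "object" belongs to its own class
+    toy_uniform = np.full((n_classes, n_classes), 1.0 / n_classes)
+    toy_weights = np.ones(n_classes)
+    assert plasticc_log_loss(toy_truth, toy_truth,
+                             relative_class_weights=toy_weights) < 1e-10
+    assert np.isclose(plasticc_log_loss(toy_truth, toy_uniform,
+                                        relative_class_weights=toy_weights),
+                      np.log(n_classes))
+    # a uniform guess scores ((n-1)/n)^2 on this Brier implementation
+    assert np.isclose(multiclass_brier(toy_truth, toy_uniform,
+                                       relative_class_weights=toy_weights),
+                      ((n_classes - 1) / n_classes) ** 2)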
+
+
+pbmap = OrderedDict([(0, 'u'), (1, 'g'), (2, 'r'), (3, 'i'), (4, 'z'), (5, 'y')])
+
+# it also helps to have passbands associated with a color
+pbcols = OrderedDict([(0, 'blueviolet'), (1, 'green'), (2, 'red'),
+                      (3, 'orange'), (4, 'black'), (5, 'brown')])
+
+pbnames = list(pbmap.values())
+
+datadir = '/Users/reneehlozek/Data/plasticc/'
+metafilename = datadir + 'training_set_metadata.csv'
+
+metadata = Table.read(metafilename, format='csv')
+nobjects = len(metadata)
+counts = Counter(metadata['target'])
+labels, values = zip(*sorted(counts.items(), key=itemgetter(1)))
+nlines = len(labels)
+
+# NB: the featurization branch below assumes `tsdict` (the dictionary of
+# cesium TimeSeries objects) and `worker` are defined as in the companion
+# notebook; run standalone, the cached feature file must already exist.
+featurefile = datadir + 'plasticc_featuretable.npz'
+if os.path.exists(featurefile):
+    featuretable, _ = featurize.load_featureset(featurefile)
+else:
+    features_list = []
+    with tqdm_notebook(total=nobjects, desc="Computing Features") as pbar:
+        with schwimmbad.MultiPool() as pool:
+            results = pool.imap(worker, list(tsdict.values()))
+            for res in results:
+                features_list.append(res)
+                pbar.update()
+
+    featuretable = featurize.assemble_featureset(features_list=features_list,
+                                                 time_series=tsdict.values())
+    featurize.save_featureset(fset=featuretable, path=featurefile)
+
+
+# rename the cesium feature columns so each carries its passband
+old_names = featuretable.columns.values
+new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
+cols = [featuretable[col] for col in old_names]
+allfeats = Table(cols, names=new_names)
+del featuretable
+
+
+splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
+splits = list(splitter.split(allfeats, metadata['target']))[0]
+train_ind, test_ind = splits
+
+
+corr = allfeats.to_pandas().corr()
+
+# Generate a mask for the upper triangle
+mask = np.zeros_like(corr, dtype=bool)
+mask[np.triu_indices_from(mask)] = True
+
+# Set up the matplotlib figure
+fig, ax = plt.subplots(figsize=(10, 8))
+
+# Draw the heatmap with the mask and correct aspect ratio
+corr_plot = sns.heatmap(corr, mask=mask, cmap='RdBu', center=0,
+                        square=True, linewidths=.2, cbar_kws={"shrink": .5})
+
+Xtrain = np.array(allfeats[train_ind].as_array().tolist())
+Ytrain = np.array(metadata['target'][train_ind].tolist())
+
+Xtest = np.array(allfeats[test_ind].as_array().tolist())
+Ytest = np.array(metadata['target'][test_ind].tolist())
+
+
+# one PCA component per per-passband feature, plus the three meta columns
+ncols = len(new_names)
+npca = (ncols - 3)//len(pbnames) + 3
+
+pca = PCA(n_components=npca, whiten=True, svd_solver="full", random_state=42)
+Xtrain_pca = pca.fit_transform(Xtrain)
+Xtest_pca = pca.transform(Xtest)
+
+fig, ax = plt.subplots()
+ax.plot(np.arange(npca), pca.explained_variance_ratio_, color='C0')
+ax2 = ax.twinx()
+ax2.plot(np.arange(npca), np.cumsum(pca.explained_variance_ratio_), color='C1')
+ax.set_yscale('log')
+ax.set_xlabel('PCA Component')
+ax.set_ylabel('Explained Variance Ratio')
+ax2.set_ylabel('Cumulative Explained Variance Ratio')
+fig.tight_layout()
+
+# NB: this saves the current (PCA variance) figure, not the correlation
+# heatmap drawn above
+plt.savefig('corr.png')
+plt.clf()
+
+original = False
+# Original notebook from Gautham
+if original:
+    clf = RandomForestClassifier(n_estimators=200, criterion='gini',
+                                 oob_score=True, n_jobs=-1, random_state=42,
+                                 verbose=1, class_weight='balanced',
+                                 max_features='sqrt')
+
+    clf.fit(Xtrain_pca, Ytrain)
+    Ypred = clf.predict(Xtest_pca)
+
+    print(Ypred[0:5], 'first few predictions')
+    print(type(Ypred), type(Ytest), 'types')
+    print(np.shape(Ypred), np.shape(Ytest), 'shapes')
+    cm = confusion_matrix(Ytest, Ypred, labels=labels)
+    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+    annot = np.around(cm, 2)
+
+    # we didn't release the weights with the contest at Kaggle's request;
+    # they have been probed through the leaderboard, though. We leave the
+    # reader to adjust them as they see fit -- the official metric is
+    # always what is on Kaggle's leaderboard. This script is solely for
+    # demonstration.
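+    # The truth matrix built manually below has a one-liner equivalent in
+    # scikit-learn -- a sketch for reference only, assuming `clf.classes_`
+    # carries the label ordering of the probability columns:
+    #
+    #   from sklearn.preprocessing import label_binarize
+    #   sklearn_truth = label_binarize(Ytest, classes=clf.classes_)
+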
+    weights = np.ones(nlines)
+    # we want the actual prediction probabilities
+    Ypredprob = clf.predict_proba(Xtest_pca)
+    print(Ypred[0:5], 'first few predicted labels')
+    print(Ytest, 'truth labels', np.shape(Ytest))
+    print(np.shape(Ypred), np.shape(Ytest), 'shapes')
+
+    # we also need to express the truth table as a matrix
+    sklearn_truth = np.zeros((len(Ytest), nlines))
+    label_index_map = dict(zip(clf.classes_, np.arange(nlines)))
+
+    print(labels, 'labels')
+    print(label_index_map, 'mapping')
+
+    for i, x in enumerate(Ytest):
+        sklearn_truth[i][label_index_map[x]] = 1
+
+    print(sklearn_truth, 'sklearn truth matrix')
+
+    logloss = plasticc_log_loss(sklearn_truth, Ypredprob, relative_class_weights=weights)
+    print(logloss, 'logloss')
+
+    fig, ax = plt.subplots(figsize=(9, 7))
+    sns.heatmap(cm, xticklabels=labels, yticklabels=labels, cmap='Blues', annot=annot, lw=0.5)
+    ax.set_xlabel('Predicted Label')
+    ax.set_ylabel('True Label')
+    ax.set_aspect('equal')
+    plt.savefig('testcm.png')
+
+
+# Now we are working with challenge entries
+else:
+    name = 'validation'  # or 'kyleboone', 'mikensilogram', 'majortom'
+
+    subdir = '/Users/reneehlozek/Dropbox/First_10_submissions/submissions_renorm/'
+    Rprob = pd.read_csv(subdir + name + '_probs.csv')
+    if name == 'validation':
+        # the validation probabilities carry no class-99 (novel class) column
+        Rprob = Rprob.drop(['99'], axis=1)
+    Rtruth = pd.read_csv(subdir + name + '_truth.csv')
+
+    # weightinds are the up-weighted class labels; inds are their
+    # positions among the probability columns
+    if name == 'validation':
+        weightinds = [51, 60, 4, 91]
+        inds = [0, 1, 7, 13]
+    else:
+        weightinds = [51, 60, 4, 91, 99]
+        inds = [0, 1, 7, 13, 14]
+
+    weightvals = 2  # up-weighting factor applied below
+    Rcols = Rprob.columns.tolist()
+    Rtruthcols = Rtruth.columns.tolist()
+    print(Rcols, 'probability columns before trimming')
+    print(Rtruthcols, 'truth columns before trimming')
+
+    old_adjust = False
+
+    # remove the object ID column from both tables
+    Rcols.pop(0)
+    Rtruthcols.pop(0)
+
+    print(Rcols, 'probability columns after trimming')
+    print(Rtruthcols, 'truth columns after trimming')
+
+    if old_adjust:
+        # older submission formats kept the object ID in other positions
+        if name == '3_MajorTom':
+            Rcols.pop(-1)
+            Rtruthcols.pop(-1)
+            Rtruthcols.pop(0)
+        if name == '1_Kyle':
+            Rcols.pop(0)
+            Rtruthcols.pop(1)
+            Rtruthcols.pop(0)
+        if name == '2_MikeSilogram':
+            Rcols.pop(-2)
+            Rtruthcols.pop(-2)
+            Rtruthcols.pop(0)
+
+    truthvals = np.array(Rtruth[Rtruthcols[:]])
+    truvals = [str(k[0]) for k in truthvals]
+    labels = [j for j in Rcols[:]]
+    Rprobb = np.array(Rprob[Rcols[:]])
+
+    print(labels, 'class labels')
+
+    # Build the truth matrix, where each column is either a one or a zero,
+    # from the vector of true labels
+    truvalmat = np.array(0 * Rprobb)
+    indtru = [None] * np.shape(Rprobb)[0]
+
+    for j in range(np.shape(Rprobb)[0]):
+        try:
+            indtru[j] = labels.index(str(truvals[j]))
+            truvalmat[j, indtru[j]] = 1
+        except ValueError:
+            # the true label is missing from the probability columns
+            print('true label not found among columns:',
+                  indtru[j], truvalmat[j], len(labels))
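+
+    # The per-row loop above and the comprehensions below can also be
+    # written in vectorized form -- a sketch, assuming `labels` matches
+    # the column order of `Rprobb`:
+    #
+    #   indpred = Rprobb.argmax(axis=1)
+    #   predvals = np.asarray(labels)[indpred]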
+
+    # Making a vector of predicted labels from the probabilities
+    predvals = [labels[np.argmax(Rprobb[j, :])] for j in range(np.shape(truvalmat)[0])]
+    indpred = [np.argmax(Rprobb[j, :]) for j in range(np.shape(truvalmat)[0])]
+
+    rcm = confusion_matrix(truvals, predvals, labels=labels)
+    rcm = rcm.astype('float') / rcm.sum(axis=1)[:, np.newaxis]
+    rannot = np.around(rcm, 2)
+
+    fig, ax = plt.subplots(figsize=(9, 7))
+    sns.heatmap(rcm, xticklabels=labels, yticklabels=labels, cmap='Blues', annot=rannot, lw=0.5)
+    ax.set_xlabel('Predicted Label')
+    ax.set_ylabel('True Label')
+    ax.set_aspect('equal')
+    plt.savefig('testrcm_' + name + '.png')
+
+    # up-weight the classes at positions `inds`; the labels are e.g.
+    # ['6', '15', '16', '42', '52', '53', '62', '64', '65', '67', '88', '90', '92', '95', '99']
+    nclass = np.shape(truvalmat)[1]
+    weights = np.ones(nclass)
+    weights[inds] = weightvals
+
+    print(nclass, 'nclass')
+    print(weights, 'weights')
+
+    # The log loss functions take in matrices of truth and probability
+    plasticc_logloss = plasticc_log_loss(truvalmat, Rprobb, relative_class_weights=weights)
+    brier = multiclass_brier(truvalmat, Rprobb, relative_class_weights=weights)
+    logloss = regular_log_loss(truvalmat, Rprobb, relative_class_weights=weights)
+    print(name)
+    print(plasticc_logloss, 'plasticc_logloss')
+    print(brier, 'brier')
+    print(logloss, 'normal logloss')
diff --git a/metrics_vals.txt b/metrics_vals.txt
new file mode 100644
index 0000000..06eb4bd
--- /dev/null
+++ b/metrics_vals.txt
@@ -0,0 +1,4 @@
+# name plasticc_logloss brier normal_logloss
+kyleboone 0.6562792989891841 0.22094486474355302 0.6790594402429536
+mikensilogram 0.6583415162647139 0.209418549786171 0.6782521795000361
+majortom 0.670999029043059 0.2293620050129475 0.6973712882709187
diff --git a/metrics_vals_renormed.txt b/metrics_vals_renormed.txt
new file mode 100644
index 0000000..4ee30fc
--- /dev/null
+++ b/metrics_vals_renormed.txt
@@ -0,0 +1,5 @@
+# name plasticc_logloss brier normal_logloss
+kyleboone 4.024469794343425 0.23734975600277658 2.876605991736937
+mikensilogram 3.987492962757713 0.21822754720731813 2.8522279622784206
+majortom 4.038077884649693 0.24882494540039354 2.89741619302860
+validation 3.3651467007009837 0.37609963717503664 2.545505628097807
diff --git a/piechart.py b/piechart.py
new file mode 100644
index 0000000..1e09a0d
--- /dev/null
+++ b/piechart.py
@@ -0,0 +1,25 @@
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+mpl.rcParams['font.size'] = 15.0
+
+
+labels = ['Astronomy', 'Other STEM', 'Not Software', 'Unknown', 'Software/Data Science']
+sizes = [51, 90, 65, 666, 443]
+
+# colors for each wedge
+colors = ['#1F0322', '#8A1C7C', '#DA4167', '#F0BCD4', '#899D78']
+
+# explode: pull each wedge slightly away from the centre
+explode = (0.05, 0.05, 0.05, 0.05, 0.02)
+fig1, ax1 = plt.subplots(figsize=(15, 12))
+
+ax1.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90,
+        pctdistance=0.85, explode=explode)
+# draw a white circle over the centre to turn the pie into a donut chart
+centre_circle = plt.Circle((0, 0), 0.70, fc='white')
+fig = plt.gcf()
+fig.gca().add_artist(centre_circle)
+# Equal aspect ratio ensures that pie is drawn as a circle
+ax1.axis('equal')
+plt.tight_layout()
+plt.savefig('pie.png')
diff --git a/plasticc_classification_demo.ipynb b/plasticc_classification_demo.ipynb
index 1a92ccf..6716696 100644
--- a/plasticc_classification_demo.ipynb
+++ b/plasticc_classification_demo.ipynb
@@ -16,7 +16,9 @@
 {
  "cell_type": "code",
  "execution_count": 1,
- "metadata": {},
+ "metadata": {
+  "collapsed": false
+ },
 "outputs": [
  {
   "data": {
@@ -58,18 +60,11 @@
 },
 {
  "cell_type": 
"code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/gnarayan/anaconda3/envs/plasticc/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n", - " from numpy.core.umath_tests import inner1d\n" - ] - } - ], + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ "%matplotlib notebook\n", "import sys\n", @@ -88,7 +83,8 @@ "from sklearn.decomposition import PCA\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import confusion_matrix\n", - "import seaborn as sns" + "import seaborn as sns\n", + "import pandas as pd" ] }, { @@ -103,7 +99,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'y')])\n", @@ -125,14 +123,16 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { "text/html": [ "Table length=7848\n", - "\n", - "\n", + "
object_idradeclgallgalbddf_boolhostgal_speczhostgal_photozhostgal_photoz_errdistmodmwebvtarget
\n", + "\n", "\n", "\n", "\n", @@ -159,7 +159,7 @@ ], "text/plain": [ "
object_idradeclgal_lgal_bddfhostgal_speczhostgal_photozhostgal_photoz_errdistmodmwebvtarget
int64float64float64float64float64int64float64float64float64float64float64int64
615349.046051-61.943836320.79653-51.75370610.00.00.0nan0.01792
71353.085938-27.784405223.525509-54.46074811.81811.62670.255245.40630.00788
\n", - "object_id ra decl gall ... distmod mwebv target\n", + "object_id ra decl gal_l ... distmod mwebv target\n", " int64 float64 float64 float64 ... float64 float64 int64 \n", "--------- ---------- ---------- ---------- ... ------- ------- ------\n", " 615 349.046051 -61.943836 320.79653 ... nan 0.017 92\n", @@ -191,8 +191,8 @@ } ], "source": [ - "datadir = 'data/'\n", - "metafilename = f'{datadir}/plasticc_training_set_metadata.csv'\n", + "datadir = '/Users/reneehlozek/Data/plasticc'\n", + "metafilename = datadir+'/training_set_metadata.csv'\n", "metadata = Table.read(metafilename, format='csv')\n", "nobjects = len(metadata)\n", "metadata" @@ -217,8 +217,10 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, + "execution_count": 12, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1003,7 +1005,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -1016,7 +1018,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/gnarayan/anaconda3/envs/plasticc/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n", + "//anaconda/envs/plasticc3/lib/python3.5/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n", " return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval\n" ] } @@ -1045,8 +1047,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 13, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -1831,7 +1835,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -1873,15 +1877,17 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, + "execution_count": 16, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { "text/html": [ "Table length=1421705\n", - "
\n", - "\n", + "
object_idmjdpassbandfluxflux_errdetected_bool
\n", + "\n", "\n", "\n", "\n", @@ -1908,39 +1914,39 @@ ], "text/plain": [ "
object_idmjdpassbandfluxflux_errdetected
int64float64int64float64float64int64
61559750.42292-544.8103033.6229521
61559750.43061-816.4343265.553371
\n", - "object_id mjd passband flux flux_err detected_bool\n", - " int64 float64 int64 float64 float64 int64 \n", - "--------- ---------- -------- ------------ --------- -------------\n", - " 615 59750.4229 2 -544.810303 3.622952 1\n", - " 615 59750.4306 1 -816.434326 5.55337 1\n", - " 615 59750.4383 3 -471.385529 3.801213 1\n", - " 615 59750.445 4 -388.984985 11.395031 1\n", - " 615 59752.407 2 -681.858887 4.041204 1\n", - " 615 59752.4147 1 -1061.457031 6.472994 1\n", - " 615 59752.4224 3 -524.95459 3.552751 1\n", - " 615 59752.4334 4 -393.480225 3.599346 1\n", - " 615 59752.4435 5 -355.88678 10.421921 1\n", - " 615 59767.2968 2 -548.01355 3.462291 1\n", - " ... ... ... ... ... ...\n", - "130779836 60542.0489 4 -60.500492 14.743795 0\n", - "130779836 60543.0247 4 -48.527161 24.00408 0\n", - "130779836 60545.9844 5 32.006413 77.931732 0\n", - "130779836 60546.9804 5 68.152985 56.351048 0\n", - "130779836 60548.9789 4 -60.066154 34.353317 0\n", - "130779836 60555.9838 4 -39.881969 46.477093 0\n", - "130779836 60560.0459 1 14.894439 18.947685 0\n", - "130779836 60571.0225 5 30.59313 50.69529 0\n", - "130779836 60585.9974 4 -23.471439 44.819859 0\n", - "130779836 60588.0372 0 -41.214264 51.665123 0" + "object_id mjd passband flux flux_err detected\n", + " int64 float64 int64 float64 float64 int64 \n", + "--------- ---------- -------- ------------ --------- --------\n", + " 615 59750.4229 2 -544.810303 3.622952 1\n", + " 615 59750.4306 1 -816.434326 5.55337 1\n", + " 615 59750.4383 3 -471.385529 3.801213 1\n", + " 615 59750.445 4 -388.984985 11.395031 1\n", + " 615 59752.407 2 -681.858887 4.041204 1\n", + " 615 59752.4147 1 -1061.457031 6.472994 1\n", + " 615 59752.4224 3 -524.95459 3.552751 1\n", + " 615 59752.4334 4 -393.480225 3.599346 1\n", + " 615 59752.4435 5 -355.88678 10.421921 1\n", + " 615 59767.2968 2 -548.01355 3.462291 1\n", + " ... ... ... ... ... 
...\n", + "130779836 60542.0489 4 -60.500492 14.743795 0\n", + "130779836 60543.0247 4 -48.527161 24.00408 0\n", + "130779836 60545.9844 5 32.006413 77.931732 0\n", + "130779836 60546.9804 5 68.152985 56.351048 0\n", + "130779836 60548.9789 4 -60.066154 34.353317 0\n", + "130779836 60555.9838 4 -39.881969 46.477093 0\n", + "130779836 60560.0459 1 14.894439 18.947685 0\n", + "130779836 60571.0225 5 30.59313 50.69529 0\n", + "130779836 60585.9974 4 -23.471439 44.819859 0\n", + "130779836 60588.0372 0 -41.214264 51.665123 0" ] }, - "execution_count": 7, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "lcfilename = f'{datadir}/plasticc_training_set.csv.gz'\n", + "lcfilename = datadir+'/training_set.csv'\n", "lcdata = Table.read(lcfilename, format='csv')\n", "lcdata" ] @@ -1954,13 +1960,15 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 17, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "38742c4ea0264f4ebb100bb690ec1769", + "model_id": "79951c6db3224da69dc365516f005683", "version_major": 2, "version_minor": 0 }, @@ -2014,8 +2022,10 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": 18, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "features_to_use = [\"amplitude\",\n", @@ -2033,8 +2043,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": 19, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# we'll turn off warnings for a bit, because numpy can be whiny. \n", @@ -2057,8 +2069,10 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, + "execution_count": 20, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def worker(tsobj):\n", @@ -2071,11 +2085,35 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "58939229a39c4e05a2274b7a65251d16", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Computing Features', max=7848), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ - "featurefile = f'{datadir}/plasticc_featuretable.npz'\n", + "featurefile = datadir + '/plasticc_featuretable.npz'\n", "if os.path.exists(featurefile):\n", " featuretable, _ = featurize.load_featureset(featurefile)\n", "else:\n", @@ -2101,8 +2139,10 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, + "execution_count": 22, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "old_names = featuretable.columns.values\n", @@ -2123,8 +2163,10 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, + "execution_count": 23, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)\n", @@ -2143,8 +2185,10 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 24, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -2929,7 +2973,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -2963,8 +3007,10 @@ }, { "cell_type": "code", 
- "execution_count": 16, - "metadata": {}, + "execution_count": 25, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "Xtrain = np.array(allfeats[train_ind].as_array().tolist())\n", @@ -2976,8 +3022,10 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, + "execution_count": 26, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "ncols = len(new_names)\n", @@ -2986,8 +3034,10 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "execution_count": 27, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pca = PCA(n_components=npca, whiten=True, svd_solver=\"full\", random_state=42)\n", @@ -2997,8 +3047,10 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, + "execution_count": 28, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -3783,7 +3835,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -3814,8 +3866,10 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, + "execution_count": 29, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "clf = RandomForestClassifier(n_estimators=200, criterion='gini',\\\n", @@ -3832,16 +3886,18 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, + "execution_count": 30, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 0.3s\n", - "[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 1.6s\n", - "[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 1.6s finished\n", + "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 0.2s\n", + "[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 0.9s\n", + "[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 0.9s finished\n", "[Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 0.0s\n", "[Parallel(n_jobs=4)]: Done 192 tasks | elapsed: 0.1s\n", "[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed: 0.1s finished\n" @@ -3862,8 +3918,19 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "cm = confusion_matrix(Ytest, Ypred, labels=labels)\n", @@ -3880,8 +3947,35 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prob = pd.read_csv('/Users/reneehlozek/Dropbox/First_10_submissions/1_Kyle.csv')\n", + "truth = pd.read_csv('/Users/reneehlozek/Dropbox/First_10_submissions/1_Kyle_truth.csv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "cm = confusion_matrix(truth, prob, labels=labels)\n", + "cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", + "annot = np.around(cm, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -4666,7 +4760,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -4684,6 +4778,15 @@ "ax.set_aspect('equal')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -4697,8 +4800,10 @@ }, { "cell_type": "code", - "execution_count": 24, - 
"metadata": {}, + "execution_count": 33, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):\n", @@ -4725,8 +4830,10 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, + "execution_count": 34, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -4846,10 +4953,11 @@ } ], "metadata": { + "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python (plasticc)", + "display_name": "Python [plasticc3]", "language": "python", - "name": "plasticc" + "name": "Python [plasticc3]" }, "language_info": { "codemirror_mode": { @@ -4861,7 +4969,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.5.6" } }, "nbformat": 4,