diff --git a/get_metrics.py b/get_metrics.py
new file mode 100644
index 0000000..0339a4a
--- /dev/null
+++ b/get_metrics.py
@@ -0,0 +1,374 @@
+
+import matplotlib
+# force a GUI backend before pyplot is imported; 'TkAgg' is the canonical name
+# (the deprecated `warn` keyword is dropped for compatibility with newer matplotlib)
+matplotlib.use('TkAgg', force=True)
+from matplotlib import pyplot as plt
+
+import sys
+import os
+from collections import Counter, OrderedDict
+import numpy as np
+from operator import itemgetter
+from astropy.table import Table
+import schwimmbad
+from cesium.time_series import TimeSeries
+import cesium.featurize as featurize
+from tqdm import tqdm_notebook
+import sklearn
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.decomposition import PCA
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import confusion_matrix
+import seaborn as sns
+import pandas as pd
+
+
+def regular_log_loss(y_true, y_pred, relative_class_weights=None):
+    """
+    Unweighted (uniform class weight) version of the log loss used for the
+    Kaggle challenge; `relative_class_weights` is accepted for API symmetry
+    with plasticc_log_loss but ignored.
+    """
+    predictions = y_pred.copy()
+    # sanitize predictions
+    epsilon = sys.float_info.epsilon
+    # this is machine dependent but essentially prevents log(0)
+    predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
+    predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
+    predictions = np.log(predictions)
+    # multiplying the arrays is equivalent to a truth mask as y_true only contains zeros and ones
+    class_logloss = []
+    uniform_class_weights = np.ones(predictions.shape[1])
+    for i in range(predictions.shape[1]):
+        # average column-wise log loss with the truth mask applied
+        result = np.average(predictions[:, i][y_true[:, i] == 1])
+        class_logloss.append(result)
+    return -1 * np.average(class_logloss, weights=uniform_class_weights)
+
+
+def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):
+ """
+ Implementation of weighted log loss used for the Kaggle challenge
+ """
+ predictions = y_pred.copy()
+ # sanitize predictions
+ epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0)
+ predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
+ predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
+ predictions = np.log(predictions)
+    # multiplying the arrays is equivalent to a truth mask as y_true only contains zeros and ones
+    class_logloss = []
+    for i in range(predictions.shape[1]):
+        # average column-wise log loss with the truth mask applied
+        result = np.average(predictions[:, i][y_true[:, i] == 1])
+        class_logloss.append(result)
+ return -1 * np.average(class_logloss, weights=relative_class_weights)
+
+
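+# Toy self-check (assumed toy values, not challenge data): with uniform 1/2
+# predictions over two classes the weighted log loss is log(2) for any
+# choice of weights, and matches regular_log_loss by construction.
+def _selftest_plasticc_log_loss():
+    y_true = np.array([[1, 0], [0, 1]])
+    y_half = np.full((2, 2), 0.5)
+    loss = plasticc_log_loss(y_true, y_half, relative_class_weights=np.ones(2))
+    assert np.isclose(loss, np.log(2))
+    assert np.isclose(loss, regular_log_loss(y_true, y_half))
+
+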
+def multiclass_brier(y_true, y_pred, relative_class_weights=None):
+    """
+    Implementation of the multiclass Brier score for the Kaggle challenge
+    """
+    predictions = y_pred.copy()
+ q_each = (predictions - y_true) ** 2
+ class_brier = []
+ for i in range(predictions.shape[1]):
+ result = np.average(q_each[:, i][y_true[:, i] == 1])
+ class_brier.append(result)
+ return np.average(class_brier, weights=relative_class_weights)
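+
+
+# Toy self-check (assumed toy values): uniform 1/2 predictions over two
+# classes give (0.5 - 1)^2 = 0.25 in each true-class column, so the score
+# should be exactly 0.25.
+def _selftest_multiclass_brier():
+    y_true = np.array([[1, 0], [0, 1]])
+    y_half = np.full((2, 2), 0.5)
+    score = multiclass_brier(y_true, y_half, relative_class_weights=np.ones(2))
+    assert np.isclose(score, 0.25)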
+
+
+pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'y')])
+
+# it also helps to have passbands associated with a color
+pbcols = OrderedDict([(0,'blueviolet'), (1,'green'), (2,'red'),
+                      (3,'orange'), (4, 'black'), (5, 'brown')])
+
+pbnames = list(pbmap.values())
+datadir = '/Users/reneehlozek/Data/plasticc/'
+metafilename = datadir+'training_set_metadata.csv'
+
+metadata = Table.read(metafilename, format='csv')
+nobjects = len(metadata)
+counts = Counter(metadata['target'])
+labels, values = zip(*sorted(counts.items(), key=itemgetter(1)))
+nlines = len(labels)
+
+
+featurefile = datadir+'plasticc_featuretable.npz'
+if os.path.exists(featurefile):
+ featuretable, _ = featurize.load_featureset(featurefile)
+else:
+    # NOTE: `worker` and `tsdict` (the per-object cesium TimeSeries dict) are
+    # built in plasticc_classification_demo.ipynb; this branch only runs if
+    # the feature file saved by that notebook is missing.
+    features_list = []
+    with tqdm_notebook(total=nobjects, desc="Computing Features") as pbar:
+        with schwimmbad.MultiPool() as pool:
+            results = pool.imap(worker, list(tsdict.values()))
+ for res in results:
+ features_list.append(res)
+ pbar.update()
+
+ featuretable = featurize.assemble_featureset(features_list=features_list,\
+ time_series=tsdict.values())
+ featurize.save_featureset(fset=featuretable, path=featurefile)
+
+
+
+old_names = featuretable.columns.values
+new_names = ['{}_{}'.format(x, pbmap.get(y,'meta')) for x,y in old_names]
+cols = [featuretable[col] for col in old_names]
+allfeats = Table(cols, names=new_names)
+del featuretable
+
+
+splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
+splits = list(splitter.split(allfeats, metadata['target']))[0]
+train_ind, test_ind = splits
+
+
+corr = allfeats.to_pandas().corr()
+
+# Generate a mask for the upper triangle
+mask = np.zeros_like(corr, dtype=bool)
+mask[np.triu_indices_from(mask)] = True
+
+# Set up the matplotlib figure
+fig, ax = plt.subplots(figsize=(10, 8))
+
+# Draw the heatmap with the mask and correct aspect ratio
+corr_plot = sns.heatmap(corr, mask=mask, cmap='RdBu', center=0,
+                        square=True, linewidths=.2, cbar_kws={"shrink": .5})
+fig.savefig('corr.png')
+
+Xtrain = np.array(allfeats[train_ind].as_array().tolist())
+Ytrain = np.array(metadata['target'][train_ind].tolist())
+
+Xtest = np.array(allfeats[test_ind].as_array().tolist())
+Ytest = np.array(metadata['target'][test_ind].tolist())
+
+
+ncols = len(new_names)
+npca = (ncols - 3)//len(pbnames) + 3
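+# i.e. one PCA component per distinct per-passband feature plus the three
+# metadata columns (an assumed reading of the feature-table layout)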
+
+pca = PCA(n_components=npca, whiten=True, svd_solver="full", random_state=42)
+Xtrain_pca = pca.fit_transform(Xtrain)
+Xtest_pca = pca.transform(Xtest)
+
+fig, ax = plt.subplots()
+ax.plot(np.arange(npca), pca.explained_variance_ratio_, color='C0')
+ax2 = ax.twinx()
+ax2.plot(np.arange(npca), np.cumsum(pca.explained_variance_ratio_), color='C1')
+ax.set_yscale('log')
+ax.set_xlabel('PCA Component')
+ax.set_ylabel('Explained Variance Ratio')
+ax2.set_ylabel('Cumulative Explained Ratio')
+fig.tight_layout()
+
+# save the PCA variance figure under its own name so it does not clobber corr.png
+fig.savefig('pca_variance.png')
+plt.close('all')
+
+original = False
+# if True, run the original random forest demo from Gautham's notebook
+if original:
+ clf = RandomForestClassifier(n_estimators=200, criterion='gini',\
+ oob_score=True, n_jobs=-1, random_state=42,\
+ verbose=1, class_weight='balanced', max_features='sqrt')
+
+ clf.fit(Xtrain_pca, Ytrain)
+ Ypred = clf.predict(Xtest_pca)
+
+    print('first five predictions:', Ypred[0:5])
+    print('prediction/truth shapes:', np.shape(Ypred), np.shape(Ytest))
+ cm = confusion_matrix(Ytest, Ypred, labels=labels)
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ annot = np.around(cm, 2)
+
+# we didn't release the weights with the contest at Kaggle's request
+# they have been probed through the leaderboard though
+# we leave the reader to adjust this as they see fit
+# the official metric is always what is on Kaggle's leaderboard.
+# This notebook is solely for demonstration.
+
+ weights = np.ones(nlines)
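+    # e.g. to up-weight a couple of classes (a purely hypothetical choice,
+    # per the note above): weights[[0, 3]] = 2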
+# we want the actual prediction probabilities
+ Ypredprob = clf.predict_proba(Xtest_pca)
+    print('truth labels:', Ytest, np.shape(Ytest))
+# we also need to express the truth table as a matrix
+ sklearn_truth = np.zeros((len(Ytest), nlines))
+ label_index_map = dict(zip(clf.classes_, np.arange(nlines)))
+
+ print(labels, 'labels')
+ print(label_index_map, 'mapping')
+
+    for i, x in enumerate(Ytest):
+        sklearn_truth[i][label_index_map[x]] = 1
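+    # equivalent vectorized construction (same result):
+    #   sklearn_truth = np.eye(nlines)[[label_index_map[y] for y in Ytest]]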
+
+ print(sklearn_truth, 'truth sklearn')
+
+ logloss = plasticc_log_loss(sklearn_truth, Ypredprob, relative_class_weights=weights)
+ print(logloss, 'logloss')
+ fig, ax = plt.subplots(figsize=(9,7))
+
+ sns.heatmap(cm, xticklabels=labels, yticklabels=labels, cmap='Blues', annot=annot, lw=0.5)
+ ax.set_xlabel('Predicted Label')
+ ax.set_ylabel('True Label')
+ ax.set_aspect('equal')
+ plt.savefig('testcm.png')
+
+
+# Now we are working with challenge entries
+else:
+    # one of: 'validation', 'mikensilogram', 'kyleboone', 'majortom',
+    # '2_MikeSilogram', '1_Kyle'
+    name = 'validation'
+
+    Rprob = pd.read_csv('/Users/reneehlozek/Dropbox/First_10_submissions/submissions_renorm/'+name+'_probs.csv')
+    if name == 'validation':
+        # drop the class 99 column from the validation probabilities
+        Rprob = Rprob.drop(['99'], axis=1)
+    Rtruth = pd.read_csv('/Users/reneehlozek/Dropbox/First_10_submissions/submissions_renorm/'+name+'_truth.csv')
+ if name == 'validation':
+        weightinds = [51, 60, 4, 91]
+        inds = [0, 1, 7, 13]
+    else:
+        weightinds = [51, 60, 4, 91, 99]
+        inds = [0, 1, 7, 13, 14]
+
+    # weight applied to the classes singled out by inds (used below)
+    weightvals = 2
+    Rcols = Rprob.columns.tolist()
+    Rtruthcols = Rtruth.columns.tolist()
+    print('probability columns before dropping object ID:', Rcols)
+    print('truth columns before dropping object ID:', Rtruthcols)
+
+    old_adjust = False
+
+    # remove the leading object ID column from both column lists
+    Rcols.pop(0)
+    Rtruthcols.pop(0)
+
+    print('probability columns after:', Rcols)
+    print('truth columns after:', Rtruthcols)
+
+ if old_adjust:
+
+ # Removing the object ID from the list
+ if name == '3_MajorTom':
+ Rcols.pop(-1)
+ Rtruthcols.pop(-1)
+ Rtruthcols.pop(0)
+ if name == '1_Kyle':
+ Rcols.pop(0)
+ Rtruthcols.pop(1)
+ Rtruthcols.pop(0)
+ if name == '2_MikeSilogram':
+ Rcols.pop(-2)
+ Rtruthcols.pop(-2)
+ Rtruthcols.pop(0)
+
+    truthvals = np.array(Rtruth[Rtruthcols])
+    # vector of true labels, as strings matching the probability column names
+    truvals = [str(k[0]) for k in truthvals]
+    labels = list(Rcols)
+    Rprobb = np.array(Rprob[Rcols])
+
+    print('class labels:', labels)
+
+    # Build the truth matrix: one row per object, with a 1 in the column of
+    # the true class and zeros elsewhere
+    truvalmat = np.zeros_like(Rprobb)
+
+    indtru = [None] * np.shape(Rprobb)[0]
+
+    for j in range(np.shape(Rprobb)[0]):
+        try:
+            indtru[j] = labels.index(str(truvals[j]))
+            truvalmat[j, indtru[j]] = 1
+        except ValueError:
+            # the true class has no matching column in the submission
+            print('true label', truvals[j], 'not found among', len(labels), 'columns')
+
+ # Making a vector of predicted labels from the probabilities
+ predvals = [labels[np.argmax(Rprobb[j,:])] for j in range(np.shape(truvalmat)[0])]
+ indpred = [np.argmax(Rprobb[j,:]) for j in range(np.shape(truvalmat)[0])]
+
+
+ rcm = confusion_matrix(truvals, predvals, labels=labels)
+ rcm = rcm.astype('float') / rcm.sum(axis=1)[:, np.newaxis]
+ rannot = np.around(rcm, 2)
+
+ fig, ax = plt.subplots(figsize=(9,7))
+ sns.heatmap(rcm, xticklabels=labels, yticklabels=labels, cmap='Blues', annot=rannot, lw=0.5)
+ ax.set_xlabel('Predicted Label')
+ ax.set_ylabel('True Label')
+ ax.set_aspect('equal')
+ plt.savefig('testrcm_'+name+'.png')
+
+    nclass = np.shape(truvalmat)[1]
+    weights = np.ones(nclass)
+    # label ordering: ['6', '15', '16', '42', '52', '53', '62', '64', '65', '67', '88', '90', '92', '95', '99'],
+    # so inds up-weights classes 6, 15, 64, 95 (and 99 when present)
+    weights[inds] = weightvals
+
+    print(nclass, 'nclass')
+    print(weights, 'weights')
+
+    # The metric functions take matrices of truth and probability
+    plasticc_logloss = plasticc_log_loss(truvalmat, Rprobb, relative_class_weights=weights)
+    brier = multiclass_brier(truvalmat, Rprobb, relative_class_weights=weights)
+    logloss = regular_log_loss(truvalmat, Rprobb, relative_class_weights=weights)
+ print(name)
+ print(plasticc_logloss, 'plasticc_logloss')
+ print(brier, 'brier')
+ print(logloss, 'normal logloss')
+
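+# run the toy metric self-checks sketched above (cheap, pure-numpy asserts)
+_selftest_plasticc_log_loss()
+_selftest_multiclass_brier()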
+
+
+
diff --git a/metrics_vals.txt b/metrics_vals.txt
new file mode 100644
index 0000000..06eb4bd
--- /dev/null
+++ b/metrics_vals.txt
@@ -0,0 +1,4 @@
+# name plasticc_logloss brier normal_logloss
+kyleboone 0.6562792989891841 0.22094486474355302 0.6790594402429536
+mikensilogram 0.6583415162647139 0.209418549786171 0.6782521795000361
+majortom 0.670999029043059 0.2293620050129475 0.6973712882709187
diff --git a/metrics_vals_renormed.txt b/metrics_vals_renormed.txt
new file mode 100644
index 0000000..4ee30fc
--- /dev/null
+++ b/metrics_vals_renormed.txt
@@ -0,0 +1,5 @@
+# name plasticc_logloss brier normal_logloss
+kyleboone 4.024469794343425 0.23734975600277658 2.876605991736937
+mikensilogram 3.987492962757713 0.21822754720731813 2.8522279622784206
+majortom 4.038077884649693 0.24882494540039354 2.89741619302860
+validation 3.3651467007009837 0.37609963717503664 2.545505628097807
diff --git a/piechart.py b/piechart.py
new file mode 100644
index 0000000..1e09a0d
--- /dev/null
+++ b/piechart.py
@@ -0,0 +1,25 @@
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import pandas as pd
+mpl.rcParams['font.size'] = 15.0
+
+
+labels = ['Astronomy', 'Other STEM', 'Not Software', 'Unknown', 'Software/Data Science']
+sizes = [51, 90, 65, 666, 443]
+# colors
+colors = ['#1F0322', '#8A1C7C', '#DA4167', '#F0BCD4', '#899D78']
+
+# explode each wedge slightly
+explode = (0.05, 0.05, 0.05, 0.05, 0.02)
+fig1, ax1 = plt.subplots(figsize=(15,12))
+
+ax1.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%',
+        startangle=90, pctdistance=0.85, explode=explode)
+# draw a white circle over the centre to turn the pie into a donut chart
+centre_circle = plt.Circle((0, 0), 0.70, fc='white')
+fig1.gca().add_artist(centre_circle)
+# Equal aspect ratio ensures that pie is drawn as a circle
+ax1.axis('equal')
+plt.tight_layout()
+plt.savefig('pie.png')
diff --git a/plasticc_classification_demo.ipynb b/plasticc_classification_demo.ipynb
index 1a92ccf..6716696 100644
--- a/plasticc_classification_demo.ipynb
+++ b/plasticc_classification_demo.ipynb
@@ -16,7 +16,9 @@
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
@@ -58,18 +60,11 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/gnarayan/anaconda3/envs/plasticc/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
- " from numpy.core.umath_tests import inner1d\n"
- ]
- }
- ],
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
"source": [
"%matplotlib notebook\n",
"import sys\n",
@@ -88,7 +83,8 @@
"from sklearn.decomposition import PCA\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import confusion_matrix\n",
- "import seaborn as sns"
+ "import seaborn as sns\n",
+ "import pandas as pd"
]
},
{
@@ -103,7 +99,9 @@
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'y')])\n",
@@ -125,14 +123,16 @@
{
"cell_type": "code",
"execution_count": 4,
- "metadata": {},
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
"text/html": [
"Table length=7848\n",
- "
\n",
- "| object_id | ra | decl | gall | galb | ddf_bool | hostgal_specz | hostgal_photoz | hostgal_photoz_err | distmod | mwebv | target |
\n",
+ "\n",
+ "| object_id | ra | decl | gal_l | gal_b | ddf | hostgal_specz | hostgal_photoz | hostgal_photoz_err | distmod | mwebv | target |
\n",
"| int64 | float64 | float64 | float64 | float64 | int64 | float64 | float64 | float64 | float64 | float64 | int64 |
\n",
"| 615 | 349.046051 | -61.943836 | 320.79653 | -51.753706 | 1 | 0.0 | 0.0 | 0.0 | nan | 0.017 | 92 |
\n",
"| 713 | 53.085938 | -27.784405 | 223.525509 | -54.460748 | 1 | 1.8181 | 1.6267 | 0.2552 | 45.4063 | 0.007 | 88 |
\n",
@@ -159,7 +159,7 @@
],
"text/plain": [
"\n",
- "object_id ra decl gall ... distmod mwebv target\n",
+ "object_id ra decl gal_l ... distmod mwebv target\n",
" int64 float64 float64 float64 ... float64 float64 int64 \n",
"--------- ---------- ---------- ---------- ... ------- ------- ------\n",
" 615 349.046051 -61.943836 320.79653 ... nan 0.017 92\n",
@@ -191,8 +191,8 @@
}
],
"source": [
- "datadir = 'data/'\n",
- "metafilename = f'{datadir}/plasticc_training_set_metadata.csv'\n",
+ "datadir = '/Users/reneehlozek/Data/plasticc'\n",
+ "metafilename = datadir+'/training_set_metadata.csv'\n",
"metadata = Table.read(metafilename, format='csv')\n",
"nobjects = len(metadata)\n",
"metadata"
@@ -217,8 +217,10 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "metadata": {},
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
@@ -1003,7 +1005,7 @@
{
"data": {
"text/html": [
- "
"
+ "
"
],
"text/plain": [
""
@@ -1016,7 +1018,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/Users/gnarayan/anaconda3/envs/plasticc/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
+ "//anaconda/envs/plasticc3/lib/python3.5/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
" return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval\n"
]
}
@@ -1045,8 +1047,10 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "metadata": {},
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
@@ -1831,7 +1835,7 @@
{
"data": {
"text/html": [
- "
"
+ "
"
],
"text/plain": [
""
@@ -1873,15 +1877,17 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "metadata": {},
+ "execution_count": 16,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
"text/html": [
"Table length=1421705\n",
- "\n",
- "| object_id | mjd | passband | flux | flux_err | detected_bool |
\n",
+ "\n",
+ "| object_id | mjd | passband | flux | flux_err | detected |
\n",
"| int64 | float64 | int64 | float64 | float64 | int64 |
\n",
"| 615 | 59750.4229 | 2 | -544.810303 | 3.622952 | 1 |
\n",
"| 615 | 59750.4306 | 1 | -816.434326 | 5.55337 | 1 |
\n",
@@ -1908,39 +1914,39 @@
],
"text/plain": [
"\n",
- "object_id mjd passband flux flux_err detected_bool\n",
- " int64 float64 int64 float64 float64 int64 \n",
- "--------- ---------- -------- ------------ --------- -------------\n",
- " 615 59750.4229 2 -544.810303 3.622952 1\n",
- " 615 59750.4306 1 -816.434326 5.55337 1\n",
- " 615 59750.4383 3 -471.385529 3.801213 1\n",
- " 615 59750.445 4 -388.984985 11.395031 1\n",
- " 615 59752.407 2 -681.858887 4.041204 1\n",
- " 615 59752.4147 1 -1061.457031 6.472994 1\n",
- " 615 59752.4224 3 -524.95459 3.552751 1\n",
- " 615 59752.4334 4 -393.480225 3.599346 1\n",
- " 615 59752.4435 5 -355.88678 10.421921 1\n",
- " 615 59767.2968 2 -548.01355 3.462291 1\n",
- " ... ... ... ... ... ...\n",
- "130779836 60542.0489 4 -60.500492 14.743795 0\n",
- "130779836 60543.0247 4 -48.527161 24.00408 0\n",
- "130779836 60545.9844 5 32.006413 77.931732 0\n",
- "130779836 60546.9804 5 68.152985 56.351048 0\n",
- "130779836 60548.9789 4 -60.066154 34.353317 0\n",
- "130779836 60555.9838 4 -39.881969 46.477093 0\n",
- "130779836 60560.0459 1 14.894439 18.947685 0\n",
- "130779836 60571.0225 5 30.59313 50.69529 0\n",
- "130779836 60585.9974 4 -23.471439 44.819859 0\n",
- "130779836 60588.0372 0 -41.214264 51.665123 0"
+ "object_id mjd passband flux flux_err detected\n",
+ " int64 float64 int64 float64 float64 int64 \n",
+ "--------- ---------- -------- ------------ --------- --------\n",
+ " 615 59750.4229 2 -544.810303 3.622952 1\n",
+ " 615 59750.4306 1 -816.434326 5.55337 1\n",
+ " 615 59750.4383 3 -471.385529 3.801213 1\n",
+ " 615 59750.445 4 -388.984985 11.395031 1\n",
+ " 615 59752.407 2 -681.858887 4.041204 1\n",
+ " 615 59752.4147 1 -1061.457031 6.472994 1\n",
+ " 615 59752.4224 3 -524.95459 3.552751 1\n",
+ " 615 59752.4334 4 -393.480225 3.599346 1\n",
+ " 615 59752.4435 5 -355.88678 10.421921 1\n",
+ " 615 59767.2968 2 -548.01355 3.462291 1\n",
+ " ... ... ... ... ... ...\n",
+ "130779836 60542.0489 4 -60.500492 14.743795 0\n",
+ "130779836 60543.0247 4 -48.527161 24.00408 0\n",
+ "130779836 60545.9844 5 32.006413 77.931732 0\n",
+ "130779836 60546.9804 5 68.152985 56.351048 0\n",
+ "130779836 60548.9789 4 -60.066154 34.353317 0\n",
+ "130779836 60555.9838 4 -39.881969 46.477093 0\n",
+ "130779836 60560.0459 1 14.894439 18.947685 0\n",
+ "130779836 60571.0225 5 30.59313 50.69529 0\n",
+ "130779836 60585.9974 4 -23.471439 44.819859 0\n",
+ "130779836 60588.0372 0 -41.214264 51.665123 0"
]
},
- "execution_count": 7,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "lcfilename = f'{datadir}/plasticc_training_set.csv.gz'\n",
+ "lcfilename = datadir+'/training_set.csv'\n",
"lcdata = Table.read(lcfilename, format='csv')\n",
"lcdata"
]
@@ -1954,13 +1960,15 @@
},
{
"cell_type": "code",
- "execution_count": 8,
- "metadata": {},
+ "execution_count": 17,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "38742c4ea0264f4ebb100bb690ec1769",
+ "model_id": "79951c6db3224da69dc365516f005683",
"version_major": 2,
"version_minor": 0
},
@@ -2014,8 +2022,10 @@
},
{
"cell_type": "code",
- "execution_count": 9,
- "metadata": {},
+ "execution_count": 18,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"features_to_use = [\"amplitude\",\n",
@@ -2033,8 +2043,10 @@
},
{
"cell_type": "code",
- "execution_count": 10,
- "metadata": {},
+ "execution_count": 19,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"# we'll turn off warnings for a bit, because numpy can be whiny. \n",
@@ -2057,8 +2069,10 @@
},
{
"cell_type": "code",
- "execution_count": 11,
- "metadata": {},
+ "execution_count": 20,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"def worker(tsobj):\n",
@@ -2071,11 +2085,35 @@
},
{
"cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
+ "execution_count": 21,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "58939229a39c4e05a2274b7a65251d16",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Computing Features', max=7848), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
"source": [
- "featurefile = f'{datadir}/plasticc_featuretable.npz'\n",
+ "featurefile = datadir + '/plasticc_featuretable.npz'\n",
"if os.path.exists(featurefile):\n",
" featuretable, _ = featurize.load_featureset(featurefile)\n",
"else:\n",
@@ -2101,8 +2139,10 @@
},
{
"cell_type": "code",
- "execution_count": 13,
- "metadata": {},
+ "execution_count": 22,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"old_names = featuretable.columns.values\n",
@@ -2123,8 +2163,10 @@
},
{
"cell_type": "code",
- "execution_count": 14,
- "metadata": {},
+ "execution_count": 23,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)\n",
@@ -2143,8 +2185,10 @@
},
{
"cell_type": "code",
- "execution_count": 15,
- "metadata": {},
+ "execution_count": 24,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
@@ -2929,7 +2973,7 @@
{
"data": {
"text/html": [
- "
"
+ "
"
],
"text/plain": [
""
@@ -2963,8 +3007,10 @@
},
{
"cell_type": "code",
- "execution_count": 16,
- "metadata": {},
+ "execution_count": 25,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"Xtrain = np.array(allfeats[train_ind].as_array().tolist())\n",
@@ -2976,8 +3022,10 @@
},
{
"cell_type": "code",
- "execution_count": 17,
- "metadata": {},
+ "execution_count": 26,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"ncols = len(new_names)\n",
@@ -2986,8 +3034,10 @@
},
{
"cell_type": "code",
- "execution_count": 18,
- "metadata": {},
+ "execution_count": 27,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"pca = PCA(n_components=npca, whiten=True, svd_solver=\"full\", random_state=42)\n",
@@ -2997,8 +3047,10 @@
},
{
"cell_type": "code",
- "execution_count": 19,
- "metadata": {},
+ "execution_count": 28,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
@@ -3783,7 +3835,7 @@
{
"data": {
"text/html": [
- "
"
+ "
"
],
"text/plain": [
""
@@ -3814,8 +3866,10 @@
},
{
"cell_type": "code",
- "execution_count": 20,
- "metadata": {},
+ "execution_count": 29,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"clf = RandomForestClassifier(n_estimators=200, criterion='gini',\\\n",
@@ -3832,16 +3886,18 @@
},
{
"cell_type": "code",
- "execution_count": 21,
- "metadata": {},
+ "execution_count": 30,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 0.3s\n",
- "[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 1.6s\n",
- "[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 1.6s finished\n",
+ "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 0.2s\n",
+ "[Parallel(n_jobs=-1)]: Done 192 tasks | elapsed: 0.9s\n",
+ "[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 0.9s finished\n",
"[Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 0.0s\n",
"[Parallel(n_jobs=4)]: Done 192 tasks | elapsed: 0.1s\n",
"[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed: 0.1s finished\n"
@@ -3862,8 +3918,19 @@
},
{
"cell_type": "code",
- "execution_count": 22,
- "metadata": {},
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"cm = confusion_matrix(Ytest, Ypred, labels=labels)\n",
@@ -3880,8 +3947,35 @@
},
{
"cell_type": "code",
- "execution_count": 23,
- "metadata": {},
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "prob = pd.read_csv('/Users/reneehlozek/Dropbox/First_10_submissions/1_Kyle.csv')\n",
+ "truth = pd.read_csv('/Users/reneehlozek/Dropbox/First_10_submissions/1_Kyle_truth.csv')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "cm = confusion_matrix(truth, prob, labels=labels)\n",
+ "cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
+ "annot = np.around(cm, 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"data": {
@@ -4666,7 +4760,7 @@
{
"data": {
"text/html": [
- "
"
+ "
"
],
"text/plain": [
""
@@ -4684,6 +4778,15 @@
"ax.set_aspect('equal')"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -4697,8 +4800,10 @@
},
{
"cell_type": "code",
- "execution_count": 24,
- "metadata": {},
+ "execution_count": 33,
+ "metadata": {
+ "collapsed": true
+ },
"outputs": [],
"source": [
"def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):\n",
@@ -4725,8 +4830,10 @@
},
{
"cell_type": "code",
- "execution_count": 25,
- "metadata": {},
+ "execution_count": 34,
+ "metadata": {
+ "collapsed": false
+ },
"outputs": [
{
"name": "stdout",
@@ -4846,10 +4953,11 @@
}
],
"metadata": {
+ "anaconda-cloud": {},
"kernelspec": {
- "display_name": "Python (plasticc)",
+ "display_name": "Python [plasticc3]",
"language": "python",
- "name": "plasticc"
+ "name": "Python [plasticc3]"
},
"language_info": {
"codemirror_mode": {
@@ -4861,7 +4969,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.6"
+ "version": "3.5.6"
}
},
"nbformat": 4,