
Commit 2e9038d
committed: more analyses
1 parent 285d12d · commit 2e9038d

14 files changed, +640 -80 lines

.run/8) Rank using Salient Directions best siddata2022.run.xml

+7 -3

@@ -7,16 +7,20 @@
     <env name="PYTHONUNBUFFERED" value="1" />
     <env name="MA_DATASET" value="siddata2022" />
     <env name="MA_LANGUAGE" value="de" />
-    <env name="MA_DEBUG" value="1" />
-    <env name="MA_PP_COMPONENTS" value="mfacsd2" />
+    <env name="MA_DEBUG" value="False" />
+    <env name="MA_PP_COMPONENTS" value="mfauhcsd2" />
     <env name="MA_TRANSLATE_POLICY" value="onlyorig" />
     <env name="MA_MIN_WORDS_PER_DESC" value="80" />
     <env name="MA_QUANTIFICATION_MEASURE" value="tfidf" />
     <env name="MA_EMBED_ALGO" value="mds" />
     <env name="MA_EMBED_DIMENSIONS" value="200" />
     <env name="MA_EXTRACTION_METHOD" value="tfidf" />
     <env name="MA_DCM_QUANT_MEASURE" value="count" />
-    <env name="MA_CLASSIFIER_COMPARETO_RANKING" value="count" />
+    <env name="MA_KAPPA_WEIGHTS" value="quadratic" />
+    <env name="MA_CLASSIFIER_SUCCMETRIC" value="kappa_digitized_onlypos_2" />
+    <env name="MA_PRIM_LAMBDA" value="0.5" />
+    <env name="MA_SEC_LAMBDA" value="0.2" />
+    <env name="MA_CLUSTER_DIRECTION_ALGO" value="reclassify" />
   </envs>
   <option name="SDK_HOME" value="" />
   <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/derive_conceptualspace/cli" />

config/derrac2015_edited.yml

+2 -2

@@ -9,7 +9,7 @@ translate_policy: onlyorig
 quantification_measure: [ppmi, tfidf]
 dissim_measure: norm_ang_dist
 embed_algo: mds
-embed_dimensions: [3, 50, 200]
+embed_dimensions: [3, 50, 100, 200]
 extraction_method: tfidf
 #TODO: DESC15 aimed at having ~22k keywords => make sure with my params that I also end up with something like that; with the current ones it's only 2402!
 #candidate_min_term_count: 25 #movies has a samples-to-threshold value of 100, placetypes has 35, 20newsgroups has 614, so for 8000 courses any threshold from 2 to 25 seems reasonable (BUT see above, I get too few)!!
@@ -21,7 +21,7 @@ classifier: SVM
 kappa_weights: quadratic
 classifier_succmetric: [kappa_rank2rank_onlypos_min, kappa_digitized_onlypos_2, kappa_count2rank_onlypos]
 prim_lambda: 0.5
-sec_lambda: 0.2
+sec_lambda: 0.1
 __perdataset__:
   placetypes:
     extraction_method: all #in the placetypes dataset, ALL words are candidates (21.8k)
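The kappa_weights: quadratic setting and the kappa_* success metrics refer to (project-specific variants of) Cohen's kappa between predicted and target rankings. As a minimal reference point only, quadratic-weighted Cohen's kappa for two discretized ratings can be computed with scikit-learn as follows; the specific kappa_rank2rank_* / kappa_digitized_* variants are defined elsewhere in the repo and are not reproduced here:

from sklearn.metrics import cohen_kappa_score

# a prediction vs. a gold standard on an ordinal scale (e.g. digitized ranks 0..3)
y_true = [0, 1, 2, 3, 3, 1, 0, 2]
y_pred = [0, 1, 1, 3, 2, 1, 0, 3]

# weights="quadratic" penalizes disagreements by the squared distance between categories,
# which corresponds to the kappa_weights: quadratic setting above
print(cohen_kappa_score(y_true, y_pred, weights="quadratic"))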

derive_conceptualspace/_version.py

+1 -1

@@ -1 +1 @@
-__version__ = "1.1.20220225"
+__version__ = "1.2.20220407"

derive_conceptualspace/cli/run_pipeline.py

+60 -1

@@ -383,14 +383,73 @@ def decision_trees(ctx):
 
 @generate_conceptualspace.command()
 @click_pass_add_context
-def rank_saldirs(ctx):
+def rank_saldirs_DEPRECATED(ctx):
     ctx.obj["pp_descriptions"] = ctx.p.load(None, "pp_descriptions", loader=DescriptionList.from_json, silent=True) #TODO really silent?
     ctx.obj["featureaxes"] = ctx.p.load(None, "featureaxes", loader=featureaxes_loader)
     ctx.obj["clusters"] = ctx.p.load(None, "clusters")
     #TODO this should rather contain the code from run_pipeline.decision_trees
     rank_saldirs_base(ctx.obj["pp_descriptions"], ctx.obj["embedding"], ctx.obj["featureaxes"], ctx.obj["filtered_dcm"],
                       prim_lambda=ctx.get_config("prim_lambda"), sec_lambda=ctx.get_config("sec_lambda"), metricname=ctx.get_config("classifier_succmetric"))
 
+#TODO move me!
+import numpy as np
+def get_decisions(X_test, clf, catnames, axnames):
+    # walk the fitted sklearn tree structure and collect which axis each split node uses
+    n_nodes = clf.tree_.node_count
+    children_left = clf.tree_.children_left
+    children_right = clf.tree_.children_right
+    classes = [catnames[clf.classes_[np.argmax(i)]] for i in clf.tree_.value]  # majority class per node (currently unused)
+    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
+    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
+    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
+    while len(stack) > 0:
+        # `pop` ensures each node is only visited once
+        node_id, depth = stack.pop()
+        node_depth[node_id] = depth
+        # if the left and right child of a node are not the same, we have a split node
+        is_split_node = children_left[node_id] != children_right[node_id]
+        # if it is a split node, append left and right children and their depth to `stack` so we can loop through them
+        if is_split_node:
+            stack.append((children_left[node_id], depth + 1))
+            stack.append((children_right[node_id], depth + 1))
+        else:
+            is_leaves[node_id] = True
+    alls = {}
+    for i in range(n_nodes):
+        if not is_leaves[i]:
+            alls.setdefault(node_depth[i], []).append((axnames[clf.tree_.feature[i]], clf.tree_.threshold[i]))
+    return (alls[0]+alls[1]) if len(alls) > 1 else alls[0]
+
+
+@generate_conceptualspace.command()
+@click_pass_add_context
+def rank_saldirs(ctx):
+    from derive_conceptualspace.semantic_directions.cluster_names import get_name_dict
+    from tqdm import tqdm
+    import pandas as pd
+    from derive_conceptualspace.evaluate.shallow_trees import classify_shallowtree
+    clusters = ctx.obj["clusters"] = ctx.p.load(None, "clusters", loader=cluster_loader)
+    cluster_reprs = ctx.obj["cluster_reprs"] = ctx.p.load(None, "cluster_reprs")
+    embedding = ctx.obj["embedding"] = ctx.p.load(None, "embedding")
+    descriptions = ctx.obj["pp_descriptions"] = ctx.p.load(None, "pp_descriptions", loader=DescriptionList.from_json, silent=True) #TODO really silent?
+    embedding = embedding["embedding"].embedding_
+
+    clus_rep_algo = "top_1" #TODO obvs from config
+    cluster_names = get_name_dict(clusters["clusters"], cluster_reprs, clus_rep_algo)
+    #first I want the distances to the origins of the respective dimensions (induced by the clusters), which induce the respective rankings (see DESC15 p.24u, proj2 of load_semanticspaces.load_projections)
+    axis_dists = {i: {k: v.dist(embedding[i]) for k, v in clusters["directions"].items()} for i in tqdm(range(len(embedding)))}
+    df = pd.DataFrame(axis_dists).T
+    best_per_dim = {k: descriptions._descriptions[v].title for k, v in df.idxmax().to_dict().items()}
+
+    print("Highest-ranking descriptions per dimension:\n "+"\n ".join([f"*b*{cluster_names[k].rjust(max([len(cluster_names[i]) for i in best_per_dim.keys()][:20]))}*b*: {v}" for k, v in best_per_dim.items()][:20]))
+    #TODO also show places 2, 3, 4 - here we again see very similar ones ("football stadium", "stadium", "fan" for "goalie")
+    #TODO axis_dists is all I need for the movietuner already!! I can say "give me something like X, only with more Y"
+    tr = classify_shallowtree(clusters, embedding, descriptions, ctx.obj["dataset_class"], one_vs_rest=True, dt_depth=1, test_percentage_crossval=0.33,
+                              classes="fachbereich", cluster_reprs=cluster_reprs, verbose=False, return_features=True, balance_classes=True, do_plot=False)
+    important_directions = [get_decisions(embedding, t, [i[1] for i in tr[4]], tr[-1])[0][0] for t in tr[0]]
+    best_importants = {f"{cluster_names[i]} ({j[1]})": best_per_dim[i] for i,j in zip(important_directions, tr[4])}
+    print("Highest-ranking descriptions per important dimension:\n " + "\n ".join(
+        [f"*b*{k.rjust(max([len(i) for i in best_importants.keys()]))}*b*: {v}" for k, v in best_importants.items()]))
+
+
 
 @cli.command()
 @click_pass_add_context
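The new get_decisions helper walks scikit-learn's fitted tree arrays (clf.tree_.children_left, children_right, feature, threshold) and returns the (axis name, threshold) pairs of the split nodes at depth 0 (and depth 1, if present). A small self-contained toy example of how it behaves on a depth-1 tree, assuming get_decisions from the diff above is in scope; the feature and class names are made up for illustration:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

# toy data: 30 samples, 3 "directions" as features (names are illustrative only)
rng = np.random.default_rng(0)
X = rng.normal(size=(30, 3))
y = (X[:, 1] > 0).astype(int)           # class depends on the second direction
axnames = ["theory", "lab_work", "programming"]
catnames = {0: "not-fachbereich", 1: "fachbereich"}

clf = DecisionTreeClassifier(max_depth=1).fit(X, y)

# returns the (axis, threshold) pairs of the split nodes; for a depth-1 tree that is
# just the root split, i.e. the single most informative direction
print(get_decisions(X, clf, catnames, axnames))   # e.g. [('lab_work', 0.03...)]

For the one-vs-rest depth-1 trees built in rank_saldirs this is exactly what important_directions extracts via [0][0]: the name of the one direction each per-class tree splits on.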

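rank_saldirs then ranks every embedded description along each cluster-induced direction (axis_dists) and picks the arg-max per direction via df.idxmax(). A minimal numpy/pandas sketch of that ranking idea, under the simplifying assumption that a direction is just a unit normal vector and the "distance" a plain dot product (the project's direction objects and their .dist() method are not reproduced here; all names and numbers below are illustrative):

import numpy as np
import pandas as pd

# assumed toy setup: 5 items embedded in 3-D, 2 interpretable directions given as unit normals
embedding = np.array([[0.2, 0.9, 0.1],
                      [0.8, 0.1, 0.3],
                      [0.5, 0.5, 0.5],
                      [0.1, 0.2, 0.9],
                      [0.9, 0.8, 0.1]])
directions = {"math": np.array([1.0, 0.0, 0.0]),
              "didactics": np.array([0.0, 0.0, 1.0])}
titles = ["Algebra I", "Statistics", "Mixed Methods", "Teaching Lab", "Linear Algebra"]

# distance of every item along each direction -> one ranking per direction
axis_dists = {i: {name: float(normal @ embedding[i]) for name, normal in directions.items()}
              for i in range(len(embedding))}
df = pd.DataFrame(axis_dists).T          # rows = items, columns = directions
best_per_dim = {name: titles[idx] for name, idx in df.idxmax().to_dict().items()}
print(best_per_dim)                      # {'math': 'Linear Algebra', 'didactics': 'Teaching Lab'}

This per-direction distance table is also what the #TODO in the diff refers to as already sufficient for a "give me something like X, only with more Y" recommender.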