Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start a new notebook to experiment with simhash and scikit-learn decision trees #1

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 289 additions & 0 deletions deduplicate_decision_tree.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "62890958-3971-4044-8633-b036aa220fa1",
"metadata": {},
"source": [
"# Deduplication decision tree experiment"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d214df4c-3b52-45c5-b310-5290cd2cd48f",
"metadata": {},
"outputs": [],
"source": [
"# All imports are grouped first so a fresh kernel resolves every dependency\n",
"# before any data is read.\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"# Load the catalogue export; the path is relative to the notebook's working directory.\n",
"# NOTE(review): the file name suggests it contains duplicate records - confirm provenance.\n",
"data_frame = pd.read_csv('science_with_duplicates.csv')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "70faf456-cec0-4113-9ba7-26e506afa697",
"metadata": {},
"outputs": [],
"source": [
"# Positional split: X takes every column except the last two,\n",
"# y is the second-to-last column.\n",
"# NOTE(review): in the frame preview shown in this notebook the second-to-last\n",
"# column is 'Call Number' and most columns are free text - confirm this is the\n",
"# intended target/feature selection before fitting a tree.\n",
"X = data_frame.iloc[:, :-2]\n",
"y = data_frame.iloc[:, -2]\n",
"\n",
"# random_state=0 pins the shuffle so the train/test partition is reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7d9b44fd-c9d2-4ef0-9580-12610624611b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Original id number</th>\n",
" <th>Our ID number</th>\n",
" <th>Title</th>\n",
" <th>Title (Original Script)</th>\n",
" <th>Author</th>\n",
" <th>Author (Original Script)</th>\n",
" <th>Format</th>\n",
" <th>Language</th>\n",
" <th>Published/Created</th>\n",
" <th>Date</th>\n",
" <th>Description</th>\n",
" <th>Series</th>\n",
" <th>Library</th>\n",
" <th>Location</th>\n",
" <th>Call Number</th>\n",
" <th>Notes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9921278523506421'</td>\n",
" <td>9921278523506420</td>\n",
" <td>Science : leçons théoriques et pratiques du ...</td>\n",
" <td>NaN</td>\n",
" <td>Alliette, 1738-1791</td>\n",
" <td>NaN</td>\n",
" <td>Book</td>\n",
" <td>French</td>\n",
" <td>Paris?: s.n.</td>\n",
" <td>1787.0</td>\n",
" <td>viij,94 p. ; 17 cm. (12mo)</td>\n",
" <td>NaN</td>\n",
" <td>Firestone Library</td>\n",
" <td>Stacks</td>\n",
" <td>6487.12.4</td>\n",
" <td>Caption title: Avant-propos des leçons théor...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>SCSB-4268029'</td>\n",
" <td>4268029</td>\n",
" <td>Science booklist</td>\n",
" <td>NaN</td>\n",
" <td>American Association for the Advancement of Sc...</td>\n",
" <td>NaN</td>\n",
" <td>Journal</td>\n",
" <td>English</td>\n",
" <td>Washington</td>\n",
" <td>1931.0</td>\n",
" <td>27 volumes in 1 : illustrations ; 22 x 10 cm; ...</td>\n",
" <td>NaN</td>\n",
" <td>ReCAP</td>\n",
" <td>Remote Storage</td>\n",
" <td>017 Am3</td>\n",
" <td>No more published.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SCSB-8478007'</td>\n",
" <td>8478007</td>\n",
" <td>Science bulletin</td>\n",
" <td>NaN</td>\n",
" <td>American-Soviet Science Society</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>English</td>\n",
" <td>New York</td>\n",
" <td>NaN</td>\n",
" <td>28 cm.</td>\n",
" <td>NaN</td>\n",
" <td>ReCAP</td>\n",
" <td>Remote Storage</td>\n",
" <td>JSP 80-103</td>\n",
" <td>Irregular.; Issues for Feb.-Apr. 1945 publishe...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9941093333506421'</td>\n",
" <td>9959461</td>\n",
" <td>Science : 100 scientists who changed the world</td>\n",
" <td>NaN</td>\n",
" <td>Balchin, Jon</td>\n",
" <td>NaN</td>\n",
" <td>Book</td>\n",
" <td>English</td>\n",
" <td>New York: Enchanted Lion</td>\n",
" <td>2003.0</td>\n",
" <td>208 p. : ill. ; 24 cm.</td>\n",
" <td>NaN</td>\n",
" <td>Harold P. Furth Plasma Physics Library</td>\n",
" <td>Stacks</td>\n",
" <td>Q125 .B377 2003</td>\n",
" <td>Includes index.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SCSB-9959461'</td>\n",
" <td>9959461</td>\n",
" <td>Science : 100 scientists who changed the world</td>\n",
" <td>NaN</td>\n",
" <td>Balchin, Jon</td>\n",
" <td>NaN</td>\n",
" <td>Book</td>\n",
" <td>English</td>\n",
" <td>New York: Enchanted Lion</td>\n",
" <td>2003.0</td>\n",
" <td>208 p. : ports. ; 24 cm.</td>\n",
" <td>NaN</td>\n",
" <td>ReCAP</td>\n",
" <td>Remote Storage</td>\n",
" <td>Q125 .B377 2003</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Original id number Our ID number \\\n",
"0 9921278523506421' 9921278523506420 \n",
"1 SCSB-4268029' 4268029 \n",
"2 SCSB-8478007' 8478007 \n",
"3 9941093333506421' 9959461 \n",
"4 SCSB-9959461' 9959461 \n",
"\n",
" Title Title (Original Script) \\\n",
"0 Science : leçons théoriques et pratiques du ... NaN \n",
"1 Science booklist NaN \n",
"2 Science bulletin NaN \n",
"3 Science : 100 scientists who changed the world NaN \n",
"4 Science : 100 scientists who changed the world NaN \n",
"\n",
" Author \\\n",
"0 Alliette, 1738-1791 \n",
"1 American Association for the Advancement of Sc... \n",
"2 American-Soviet Science Society \n",
"3 Balchin, Jon \n",
"4 Balchin, Jon \n",
"\n",
" Author (Original Script) Format Language Published/Created \\\n",
"0 NaN Book French Paris?: s.n. \n",
"1 NaN Journal English Washington \n",
"2 NaN NaN English New York \n",
"3 NaN Book English New York: Enchanted Lion \n",
"4 NaN Book English New York: Enchanted Lion \n",
"\n",
" Date Description Series \\\n",
"0 1787.0 viij,94 p. ; 17 cm. (12mo) NaN \n",
"1 1931.0 27 volumes in 1 : illustrations ; 22 x 10 cm; ... NaN \n",
"2 NaN 28 cm. NaN \n",
"3 2003.0 208 p. : ill. ; 24 cm. NaN \n",
"4 2003.0 208 p. : ports. ; 24 cm. NaN \n",
"\n",
" Library Location Call Number \\\n",
"0 Firestone Library Stacks 6487.12.4 \n",
"1 ReCAP Remote Storage 017 Am3 \n",
"2 ReCAP Remote Storage JSP 80-103 \n",
"3 Harold P. Furth Plasma Physics Library Stacks Q125 .B377 2003 \n",
"4 ReCAP Remote Storage Q125 .B377 2003 \n",
"\n",
" Notes \n",
"0 Caption title: Avant-propos des leçons théor... \n",
"1 No more published. \n",
"2 Irregular.; Issues for Feb.-Apr. 1945 publishe... \n",
"3 Includes index. \n",
"4 NaN "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the loaded frame to inspect its columns.\n",
"data_frame.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0f2a875-3bbf-4242-b88e-8de2d7003188",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "861c3ec6-28b6-434f-91cd-526bd4529e08",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6a35026-524f-4f3f-b823-52d9877afff6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading