Skip to content

Commit 7b02036

Browse files
committed
fix clustering issue and add testing scripts
1 parent abe9b41 commit 7b02036

13 files changed

+101
-1
lines changed

language/mlsql/mlsql/functions/dataflow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def _model_phase(keywords, filename, header, sep, train, predictors, label, algo
7878
from .keywords.replace_functions import handle_replace
7979
df = handle_replace(df, [replace])
8080
pass
81-
81+
8282
# Encode all categorical values
8383
df = encode_categorical(df)
8484

language/mlsql/mlsql/functions/keywords/cluster_functions.py

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ def handle_cluster(data, algorithm, preds, label = None, clusters = 3, split = F
1111
"""
1212
model = handle_cluster_algorithm(algorithm)
1313
if model is not None:
14+
if clusters is '':
15+
clusters = '3'
1416
model.n_clusters = int(clusters)
1517

1618
#convert list of columns to integers and covert columns to start at 0

language/mlsql/mlsql/test/get_data.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import os
2+
import requests
3+
4+
path = 'data'
5+
6+
NAMES = {
7+
''
8+
9+
}
10+
DATASETS = (
11+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/auto.csv',
12+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/boston.csv',
13+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/census.csv',
14+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/chronic.csv',
15+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/computer.csv',
16+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/iris.csv',
17+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/seeds.csv',
18+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/spam.csv',
19+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/train.csv',
20+
'https://raw.githubusercontent.com/mxhao2/MLSQL_DataSets/master/wine.csv',
21+
22+
23+
24+
)
25+
26+
def download_data(path, urls=DATASETS):
    """Download every URL in *urls* into the directory *path*.

    The directory is created when it does not exist yet. Each file is
    stored under the last path component of its URL (``os.path.basename``),
    overwriting any existing file with that name.

    Args:
        path: Directory the downloaded files are written to.
        urls: Iterable of URLs to fetch. Defaults to ``DATASETS``.

    Raises:
        requests.HTTPError: If a server responds with an error status.
        requests.RequestException: On connection failure or timeout.
        OSError: If the directory or a file cannot be created.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(path, exist_ok=True)

    for url in urls:
        # A timeout keeps the script from hanging forever on a stalled host.
        response = requests.get(url, timeout=30)
        # Fail loudly instead of saving an HTML error page as a ".csv".
        response.raise_for_status()
        name = os.path.basename(url)
        with open(os.path.join(path, name), 'wb') as f:
            f.write(response.content)
35+
36+
37+
download_data('data')
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
# Raw string: the separator "\s+" must reach the MLSQL parser with its
# backslash intact, and "\s" is not a valid escape sequence in a normal
# string literal (it triggers a warning on current Python versions).
query = r'READ "data/auto.csv" (separator = "\s+", header = 0) REPLACE ("?", mean) SPLIT (train = .8, test = .2, validation = .0) REGRESS (predictors = [2,3,4,5,6,7,8], label = 1, algorithm = simple)'
5+
6+
execute(query)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
# Raw string: the separator "\s+" must reach the MLSQL parser with its
# backslash intact, and "\s" is not a valid escape sequence in a normal
# string literal (it triggers a warning on current Python versions).
query = r'READ "data/boston.csv" (separator = "\s+", header = 0) SPLIT (train = .8, test = .2, validation = .0) REGRESS (predictors = [1,2,3,4,5,6,7,8,9,10,11,12,13], label = 14, algorithm = elastic)'
5+
6+
execute(query)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
query = 'READ "data/census.csv" (separator = ",", header = 0) REPLACE ("NaN", "mode") SPLIT (train = .8, test = 0.2) CLASSIFY (predictors = [1,2,3,4,5,6,7,8,9,10,11,12,13,14], label = 15, algorithm = logistic)'
5+
execute(query)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
print("Chronic Kidney Disease Logistic Regression")
5+
query = 'READ "data/chronic.csv" SPLIT (train = .8, test = 0.2) CLASSIFY (predictors = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24], label = 25, algorithm = logistic)'
6+
7+
execute(query)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
query = 'READ "data/computer.csv" (separator = ",", header = 0) SPLIT (train = .8, test = .2, validation = .0) REGRESS (predictors = [1,2,3,4,5,6,7,8,9], label = 10, algorithm = ridge)'
5+
6+
execute(query)
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
query = 'READ "data/iris.csv" SPLIT (train = .8, test = 0.2) CLASSIFY (predictors = [1,2,3,4], label = 5, algorithm = svm)'
5+
6+
execute(query)
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
# Raw string: the separator "\s+" must reach the MLSQL parser with its
# backslash intact, and "\s" is not a valid escape sequence in a normal
# string literal (it triggers a warning on current Python versions).
query = r'READ "data/seeds.csv" (separator = "\s+", header = 0) SPLIT (train = .8, test = .2, validation = .0) CLUSTER (predictors = [1,2,3,4,5,6,7], algorithm = kmeans)'
5+
6+
execute(query)
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
query = 'READ "data/spam.csv" SPLIT (train = .8, test = 0.2) CLASSIFY (predictors = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56], label = 58, algorithm = bayes)'
5+
6+
execute(query)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
query = 'READ "data/train.csv" (separator = ",", header = 0) REPLACE ("NaN", "mode") SPLIT (train = .8, test = 0.2) CLASSIFY (predictors = [1,3,4,5,6,7,8,9,10,11,12], label = 2, algorithm = forest)'
5+
execute(query)
+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import mlsql
2+
from mlsql import execute
3+
4+
5+
query = 'READ "data/wine.csv" (separator = ";", header = 0) SPLIT (train = .8, test = 0.2) CLASSIFY (predictors = [1,2,3,4,5,6,7,8,9,10,11], label = 12, algorithm = knn)'
6+
7+
8+
execute(query)

0 commit comments

Comments
 (0)