diff --git a/Klustering algo/Mall_Customers.csv b/Klustering algo/Mall_Customers.csv new file mode 100644 index 0000000..b324941 --- /dev/null +++ b/Klustering algo/Mall_Customers.csv @@ -0,0 +1,201 @@ +CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100) +0001,Male,19,15,39 +0002,Male,21,15,81 +0003,Female,20,16,6 +0004,Female,23,16,77 +0005,Female,31,17,40 +0006,Female,22,17,76 +0007,Female,35,18,6 +0008,Female,23,18,94 +0009,Male,64,19,3 +0010,Female,30,19,72 +0011,Male,67,19,14 +0012,Female,35,19,99 +0013,Female,58,20,15 +0014,Female,24,20,77 +0015,Male,37,20,13 +0016,Male,22,20,79 +0017,Female,35,21,35 +0018,Male,20,21,66 +0019,Male,52,23,29 +0020,Female,35,23,98 +0021,Male,35,24,35 +0022,Male,25,24,73 +0023,Female,46,25,5 +0024,Male,31,25,73 +0025,Female,54,28,14 +0026,Male,29,28,82 +0027,Female,45,28,32 +0028,Male,35,28,61 +0029,Female,40,29,31 +0030,Female,23,29,87 +0031,Male,60,30,4 +0032,Female,21,30,73 +0033,Male,53,33,4 +0034,Male,18,33,92 +0035,Female,49,33,14 +0036,Female,21,33,81 +0037,Female,42,34,17 +0038,Female,30,34,73 +0039,Female,36,37,26 +0040,Female,20,37,75 +0041,Female,65,38,35 +0042,Male,24,38,92 +0043,Male,48,39,36 +0044,Female,31,39,61 +0045,Female,49,39,28 +0046,Female,24,39,65 +0047,Female,50,40,55 +0048,Female,27,40,47 +0049,Female,29,40,42 +0050,Female,31,40,42 +0051,Female,49,42,52 +0052,Male,33,42,60 +0053,Female,31,43,54 +0054,Male,59,43,60 +0055,Female,50,43,45 +0056,Male,47,43,41 +0057,Female,51,44,50 +0058,Male,69,44,46 +0059,Female,27,46,51 +0060,Male,53,46,46 +0061,Male,70,46,56 +0062,Male,19,46,55 +0063,Female,67,47,52 +0064,Female,54,47,59 +0065,Male,63,48,51 +0066,Male,18,48,59 +0067,Female,43,48,50 +0068,Female,68,48,48 +0069,Male,19,48,59 +0070,Female,32,48,47 +0071,Male,70,49,55 +0072,Female,47,49,42 +0073,Female,60,50,49 +0074,Female,60,50,56 +0075,Male,59,54,47 +0076,Male,26,54,54 +0077,Female,45,54,53 +0078,Male,40,54,48 +0079,Female,23,54,52 +0080,Female,49,54,42 +0081,Male,57,54,51 +0082,Male,38,54,55 
+0083,Male,67,54,41 +0084,Female,46,54,44 +0085,Female,21,54,57 +0086,Male,48,54,46 +0087,Female,55,57,58 +0088,Female,22,57,55 +0089,Female,34,58,60 +0090,Female,50,58,46 +0091,Female,68,59,55 +0092,Male,18,59,41 +0093,Male,48,60,49 +0094,Female,40,60,40 +0095,Female,32,60,42 +0096,Male,24,60,52 +0097,Female,47,60,47 +0098,Female,27,60,50 +0099,Male,48,61,42 +0100,Male,20,61,49 +0101,Female,23,62,41 +0102,Female,49,62,48 +0103,Male,67,62,59 +0104,Male,26,62,55 +0105,Male,49,62,56 +0106,Female,21,62,42 +0107,Female,66,63,50 +0108,Male,54,63,46 +0109,Male,68,63,43 +0110,Male,66,63,48 +0111,Male,65,63,52 +0112,Female,19,63,54 +0113,Female,38,64,42 +0114,Male,19,64,46 +0115,Female,18,65,48 +0116,Female,19,65,50 +0117,Female,63,65,43 +0118,Female,49,65,59 +0119,Female,51,67,43 +0120,Female,50,67,57 +0121,Male,27,67,56 +0122,Female,38,67,40 +0123,Female,40,69,58 +0124,Male,39,69,91 +0125,Female,23,70,29 +0126,Female,31,70,77 +0127,Male,43,71,35 +0128,Male,40,71,95 +0129,Male,59,71,11 +0130,Male,38,71,75 +0131,Male,47,71,9 +0132,Male,39,71,75 +0133,Female,25,72,34 +0134,Female,31,72,71 +0135,Male,20,73,5 +0136,Female,29,73,88 +0137,Female,44,73,7 +0138,Male,32,73,73 +0139,Male,19,74,10 +0140,Female,35,74,72 +0141,Female,57,75,5 +0142,Male,32,75,93 +0143,Female,28,76,40 +0144,Female,32,76,87 +0145,Male,25,77,12 +0146,Male,28,77,97 +0147,Male,48,77,36 +0148,Female,32,77,74 +0149,Female,34,78,22 +0150,Male,34,78,90 +0151,Male,43,78,17 +0152,Male,39,78,88 +0153,Female,44,78,20 +0154,Female,38,78,76 +0155,Female,47,78,16 +0156,Female,27,78,89 +0157,Male,37,78,1 +0158,Female,30,78,78 +0159,Male,34,78,1 +0160,Female,30,78,73 +0161,Female,56,79,35 +0162,Female,29,79,83 +0163,Male,19,81,5 +0164,Female,31,81,93 +0165,Male,50,85,26 +0166,Female,36,85,75 +0167,Male,42,86,20 +0168,Female,33,86,95 +0169,Female,36,87,27 +0170,Male,32,87,63 +0171,Male,40,87,13 +0172,Male,28,87,75 +0173,Male,36,87,10 +0174,Male,36,87,92 +0175,Female,52,88,13 +0176,Female,30,88,86 +0177,Male,58,88,15 
+0178,Male,27,88,69
+0179,Male,59,93,14
+0180,Male,35,93,90
+0181,Female,37,97,32
+0182,Female,32,97,86
+0183,Male,46,98,15
+0184,Female,29,98,88
+0185,Female,41,99,39
+0186,Male,30,99,97
+0187,Female,54,101,24
+0188,Male,28,101,68
+0189,Female,41,103,17
+0190,Female,36,103,85
+0191,Female,34,103,23
+0192,Female,32,103,69
+0193,Male,33,113,8
+0194,Female,38,113,91
+0195,Female,47,120,16
+0196,Female,35,120,79
+0197,Female,45,126,28
+0198,Male,32,126,74
+0199,Male,32,137,18
+0200,Male,30,137,83
\ No newline at end of file
diff --git a/Klustering algo/dendrogrampy.png b/Klustering algo/dendrogrampy.png
new file mode 100644
index 0000000..a6d1f6e
Binary files /dev/null and b/Klustering algo/dendrogrampy.png differ
diff --git a/Klustering algo/dendrogramrplot.png b/Klustering algo/dendrogramrplot.png
new file mode 100644
index 0000000..ecceddc
Binary files /dev/null and b/Klustering algo/dendrogramrplot.png differ
diff --git a/Klustering algo/elbowmethodpyplot.png b/Klustering algo/elbowmethodpyplot.png
new file mode 100644
index 0000000..8e39e73
Binary files /dev/null and b/Klustering algo/elbowmethodpyplot.png differ
diff --git a/Klustering algo/elbowmethodrplot.png b/Klustering algo/elbowmethodrplot.png
new file mode 100644
index 0000000..b516fed
Binary files /dev/null and b/Klustering algo/elbowmethodrplot.png differ
diff --git a/Klustering algo/hc.py b/Klustering algo/hc.py
new file mode 100644
index 0000000..003a37b
--- /dev/null
+++ b/Klustering algo/hc.py
@@ -0,0 +1,60 @@
+"""
+Created on Fri Mar 31 21:41:34 2017
+
+@author: Robert
+
+Hierarchical (Ward) clustering of the mall customers by annual income and
+spending score: plots a dendrogram, then the resulting clusters.
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+# The seaborn styles were renamed 'seaborn-v0_8-*' in Matplotlib 3.6 and the
+# old names removed in 3.8, so use whichever spelling this install provides.
+plt.style.use(next((s for s in ('seaborn-v0_8-deep', 'seaborn-deep')
+                    if s in plt.style.available), 'default'))
+import matplotlib.cm
+# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap remains.
+cmap = plt.get_cmap('plasma')
+
+# Reading in data
+ds = pd.read_csv('Mall_Customers.csv')
+# Columns 3 and 4 are 'Annual Income (k$)' and 'Spending Score (1-100)'
+X = ds.iloc[:, [3, 4]].values
+
+# Dendrogram to choose number of clusters (k)
+import scipy.cluster.hierarchy as sch
+
+plt.figure(1)
+z = sch.linkage(X, method='ward')
+dendrogram = sch.dendrogram(z)
+plt.title('Dendrogram')
+plt.xlabel('Customers')
+plt.ylabel('Euclidean distances')
+plt.show()
+
+k = 5
+
+# Clustering
+from sklearn.cluster import AgglomerativeClustering
+
+# Ward linkage only works with Euclidean distance, which is the default, so
+# the metric is not passed: the `affinity` keyword was removed in
+# scikit-learn 1.4.
+hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
+y_hc = hc.fit_predict(X)
+
+labels = ['Cluster ' + str(i + 1) for i in range(k)]
+
+plt.figure(2)
+for i in range(k):
+    # color= (not c=) so the RGBA tuple cannot be read as per-point values
+    plt.scatter(X[y_hc == i, 0], X[y_hc == i, 1], s=20,
+                color=cmap(i / k), label=labels[i])
+# Column 0 of X is annual income, not age
+plt.xlabel('Annual Income (k$)')
+plt.ylabel('Spending score')
+plt.title('HC cluster plot')
+plt.legend()
+plt.show()
diff --git a/Klustering algo/hc.r b/Klustering algo/hc.r
new file mode 100644
index 0000000..3e78246
--- /dev/null
+++ b/Klustering algo/hc.r
@@ -0,0 +1,19 @@
+
+# Reading in data
+ds = read.csv('Mall_Customers.csv')
+X = ds[,4:5]
+
+# Creating dendrogram to choose k
+# "ward.D2" applies Ward's criterion to plain (unsquared) Euclidean distances;
+# "ward.D" only does so when it is given squared distances.
+hc = hclust(dist(X, method = "euclidean"), method = "ward.D2")
+
+plot(hc, labels = FALSE, hang = 0.03,
+     main = paste("Cluster Dendrogram"),
+     xlab = 'Customers',
+     ylab = "Euclidean distance")
+
+# Clustering
+y_hc = cutree(hc, 5)
+
+plot(X, col = y_hc)
diff --git a/Klustering algo/hcpyplot.png b/Klustering algo/hcpyplot.png
new file mode 100644
index 0000000..a9fb755
Binary files /dev/null and b/Klustering algo/hcpyplot.png differ
diff --git a/Klustering algo/kmeans.py b/Klustering algo/kmeans.py
new file mode 100644
index 0000000..147e7b6
--- /dev/null
+++ b/Klustering algo/kmeans.py
@@ -0,0 +1,64 @@
+"""
+Created on Wed Mar 29 21:42:38 2017
+
+@author: Robert
+
+K-means clustering of the mall customers by annual income and spending
+score: plots the elbow curve, then the clusters with their centroids.
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+# The seaborn styles were renamed 'seaborn-v0_8-*' in Matplotlib 3.6 and the
+# old names removed in 3.8, so use whichever spelling this install provides.
+plt.style.use(next((s for s in ('seaborn-v0_8-deep', 'seaborn-deep')
+                    if s in plt.style.available), 'default'))
+from sklearn.cluster import KMeans
+import matplotlib.cm
+# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap remains.
+cmap = plt.get_cmap('plasma')
+
+
+ds = pd.read_csv('Mall_Customers.csv')
+# Columns 3 and 4 are 'Annual Income (k$)' and 'Spending Score (1-100)'
+X = ds.iloc[:, [3, 4]].values
+
+# Choosing the value of k by the elbow method
+wcss = []
+
+for i in range(1, 21):
+    # n_init/random_state pinned so the curve is the same on every run
+    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
+    kmeans.fit(X)
+    wcss.append(kmeans.inertia_)
+
+plt.figure()
+plt.plot(range(1, 21), wcss)
+plt.title('The Elbow Method')
+plt.xlabel('Number of clusters')
+plt.ylabel('WCSS')
+plt.show()
+
+# Clustering the data
+k = 5
+kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
+y_kmeans = kmeans.fit_predict(X)
+
+labels = ['Cluster ' + str(i + 1) for i in range(k)]
+
+# Plotting the clusters
+plt.figure()
+for i in range(k):
+    # color= (not c=) so the RGBA tuple cannot be read as per-point values
+    plt.scatter(X[y_kmeans == i, 0], X[y_kmeans == i, 1], s=20,
+                color=cmap(i / k), label=labels[i])
+
+plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
+            s=100, c='black', label='Centroids', marker='X')
+# Column 0 of X is annual income, not age
+plt.xlabel('Annual Income (k$)')
+plt.ylabel('Spending score')
+plt.title('Kmeans cluster plot')
+plt.legend()
+plt.show()
diff --git a/Klustering algo/kmeans.r b/Klustering algo/kmeans.r
new file mode 100644
index 0000000..baf7f2a
--- /dev/null
+++ b/Klustering algo/kmeans.r
@@ -0,0 +1,24 @@
+
+# Reading in data
+ds = read.csv('Mall_Customers.csv')
+X = ds[4:5]
+
+# kmeans() starts from random centres; fix the seed so reruns match
+set.seed(42)
+
+# Finding k
+wcss = vector()
+for (i in 1:10)
+  wcss[i] =sum(kmeans(X, i)$withinss)
+
+plot(1:10, wcss, type = 'b', main=paste("Elbow method"), xlab = 'number clusters' )
+
+# Clustering
+# Kept in `km` so the result does not mask the kmeans() function
+km = kmeans(X, 5)
+y_kmeans = km$cluster
+
+# Visualising the clusters
+plot(X, col = y_kmeans)
+# One colour per centre, matching the cluster colours of the points
+points(km$centers, col = 1:5, pch = 8, cex = 1)
diff --git a/Klustering algo/kmeans3D.py b/Klustering algo/kmeans3D.py
new file mode 100644
index 0000000..9f66300
--- /dev/null
+++ b/Klustering algo/kmeans3D.py
@@ -0,0 +1,62 @@
+"""
+Created on Wed Mar 29 21:42:38 2017
+
+@author: Robert
+
+K-means clustering of the mall customers by age, annual income and spending
+score: plots the elbow curve, then a 3D scatter of the clusters.
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+# Importing Axes3D registers the '3d' projection on older Matplotlib versions
+from mpl_toolkits.mplot3d import Axes3D
+
+ds = pd.read_csv('Mall_Customers.csv')
+# Columns 2-4 are 'Age', 'Annual Income (k$)' and 'Spending Score (1-100)'
+X = ds.iloc[:, 2:5].values
+
+# Choosing the value of k
+wcss = []
+
+for i in range(1, 21):
+    # n_init/random_state pinned so the curve is the same on every run
+    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
+    kmeans.fit(X)
+    wcss.append(kmeans.inertia_)
+
+plt.figure(1)
+plt.plot(range(1, 21), wcss)
+plt.title('The Elbow Method')
+plt.xlabel('Number of clusters')
+plt.ylabel('WCSS')
+plt.show()
+
+k = 6
+
+# Clustering
+kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
+# fit() returns the fitted estimator itself
+y_kmeans = kmeans.fit(X)
+
+labels = y_kmeans.labels_
+
+# Making the 3D plot
+fig = plt.figure()
+# Axes3D(fig) no longer adds itself to the figure on current Matplotlib,
+# which leaves the plot empty; add_subplot attaches the axes explicitly.
+ax = fig.add_subplot(111, projection='3d')
+
+# Builtin float: the np.float alias was removed in NumPy 1.24
+ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels.astype(float))
+ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
+           kmeans.cluster_centers_[:, 2], s=100, c='black',
+           label='Centroids', marker='X')
+ax.set_xlabel('Age')
+ax.set_ylabel('Annual Income')
+ax.set_zlabel('Spending score')
+plt.title('Kmeans cluster plot')
+plt.legend()
+plt.show()
diff --git a/Klustering algo/kmeans3dplot.png b/Klustering algo/kmeans3dplot.png
new file mode 100644
index 0000000..f484b7a
Binary files /dev/null and b/Klustering algo/kmeans3dplot.png differ
diff --git a/Klustering algo/kmeanspyplot.png b/Klustering algo/kmeanspyplot.png
new file mode 100644
index 0000000..b974da0
Binary files /dev/null and b/Klustering algo/kmeanspyplot.png differ