K-means clustering

#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#create the dataframe
# Raw string: the backslash in the Windows path is kept literally instead of
# relying on "\i" being an unrecognised (and deprecated) escape sequence.
df=pd.read_csv(r"D:\income.csv")
print(df)
k-means-clustering-in-unsupervised-ml-img
# Quick look at the raw data: Age on the x-axis against Income on the y-axis.
plt.scatter(df.Age, df.Income)

plt.xlabel('Age')
plt.ylabel('Income')

plt.show()
k-means-clustering-in-unsupervised-ml-img2
# Build a K-means model with three clusters, fit it on the (Age, Income)
# columns and get back one cluster label per row.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(
    df[['Age', 'Income']]
)

print(y_predicted)

[2 2 0 0 1 1 1 1 1 1 1 0 0 0 0]


# Store each row's predicted cluster label in a new 'Cluster' column.
df['Cluster']=y_predicted
# Bare expression: shows the frame in a notebook/REPL, does nothing in a script.
df

Output:

k-means-clustering-in-unsupervised-ml-img3
#To find the co-ordinates of each cluster centre (centroid):
#one row per cluster, one column per feature the model was fitted on
print(km.cluster_centers_)

[[2.78333333e+01 5.24166667e+04]
[3.82857143e+01 1.50000000e+05]
[2.80000000e+01 8.00000000e+04]]


# Split the centroids into their coordinates. The 2-D index [:, i] selects
# column i of every row; the previous [:0] / [:1] were row slices (an empty
# array and the first row only), not the columns the comments described.
X=km.cluster_centers_[:,0]   #all rows in 0th column (Age)
Y=km.cluster_centers_[:,1]   #all rows in 1st column (Income)

# One sub-frame per cluster label so each can get its own colour and marker.
df1=df[df.Cluster==0]
df2=df[df.Cluster==1]
df3=df[df.Cluster==2]

#scatter plot the clusters
plt.scatter(df1.Age,df1['Income'],color='green',marker='o')
plt.scatter(df2.Age,df2['Income'],color='red',marker='d')
plt.scatter(df3.Age,df3['Income'],color='black',marker='s')

#scatter plot the cluster centers also
plt.scatter(X,Y,color='purple',marker='*',label='centroid')

plt.legend()

plt.xlabel('Age')
plt.ylabel('Income')

plt.show()
k-means-clustering-in-unsupervised-ml-img4
# Perform the scaling operation with MinMaxScaler so that Age and Income end
# up on a comparable range before clustering again.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Rescale the Age column (fit and transform in a single call)
df['Age'] = scaler.fit_transform(df[['Age']])

# Rescale the Income column; this re-fits the same scaler object on Income
df['Income'] = scaler.fit_transform(df[['Income']])

# Bare expression: shows the first rows in a notebook/REPL only.
df.head()

Output:

k-means-clustering-in-unsupervised-ml-img5
# Fit a fresh three-cluster K-means model on the (Age, Income) columns and
# collect the new label of every row.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(
    df[['Age', 'Income']]
)

print(y_predicted)

[0 0 0 0 2 2 2 2 1 1 1 0 0 0 0]


# Store the new labels in a lower-case 'cluster' column; this is a different
# column from the capitalised 'Cluster' one.
df['cluster']=y_predicted
# Bare expression: shows the first rows in a notebook/REPL only.
df.head()

Output:

k-means-clustering-in-unsupervised-ml-img6
# Remove the capitalised 'Cluster' column in place; `columns=` already names
# the axis, so no separate axis argument is needed.
df.drop(columns=['Cluster'], inplace=True)

# Bare expression: shows the first rows in a notebook/REPL only.
df.head()

Output:

k-means-clustering-in-unsupervised-ml-img7
# Split the centroids into their coordinates. The 2-D index [:, i] selects
# column i of every row; the previous [:0] / [:1] were row slices (an empty
# array and the first row only), not the columns the comments described.
X=km.cluster_centers_[:,0]      #all rows in 0th column (Age)
y=km.cluster_centers_[:,1]      #all rows in 1st column (Income)

# One sub-frame per cluster label so each can get its own colour and marker.
df1=df[df.cluster==0]
df2=df[df.cluster==1]
df3=df[df.cluster==2]

#scatter plot the clusters
plt.scatter(df1.Age,df1['Income'],color='green',marker='o')
plt.scatter(df2.Age,df2['Income'],color='red',marker='d')
plt.scatter(df3.Age,df3['Income'],color='black',marker='s')

#scatter plot the cluster centers also
plt.scatter(X,y,color='purple',marker='*',label='centroid')

plt.legend()

plt.xlabel('Age')
plt.ylabel('Income')

plt.show()
k-means-clustering-in-unsupervised-ml-img8
# Elbow plot: fit K-means once for every k from 1 to 9 and record the model's
# inertia_ (plotted below as the sum of squared error) for each k.
k_rng = range(1, 10)
sse = []

for k in k_rng:
    km = KMeans(n_clusters=k)
    km.fit(df[['Age', 'Income']])
    sse.append(km.inertia_)

# The bend ("elbow") in this curve is used to pick a reasonable k.
plt.plot(k_rng, sse)
plt.xlabel('k')
plt.ylabel('sum of squared error')

plt.show()

C:\Users\javab\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(


k-means-clustering-in-unsupervised-ml-img9

About the Author



Silan Software is one of India's leading providers of offline and online training for Java, Python, AI (Machine Learning, Deep Learning), Data Science, Software Development, and many other emerging technologies.

We provide Academic Training || Industrial Training || Corporate Training || Internship || Java || Python || AI using Python || Data Science etc





 PreviousNext