Lab 7: Customer Segmentation with K-Means Clustering
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Step 2: Load Data
# Replace 'your_file.csv' with your dataset file
data = pd.read_csv('your_file.csv')
# Preview the dataset
print(data.head())
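# Optional check (a minimal sketch, not part of the original lab): confirm column names,
# dtypes, and missing values before selecting features. The feature columns used below
# assume the common "Mall Customers" layout referenced in Step 3.
data.info()
print(data.isnull().sum())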
# Step 3: Data Preprocessing
# Select relevant features for segmentation (example: income and spending score)
features = data[['Annual Income (k$)', 'Spending Score (1-100)']]
# Scale the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
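# Optional sanity check (a quick sketch): after StandardScaler, each column should have
# a mean close to 0 and a standard deviation close to 1.
print(scaled_features.mean(axis=0).round(3))
print(scaled_features.std(axis=0).round(3))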
# Step 4: Determine the Optimal Number of Clusters
inertia = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)
# Plot the Elbow Curve
plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('The Elbow Method')
plt.show()
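# Optional: the silhouette score gives a second opinion on the choice of K
# (a sketch added here, not part of the original lab). Higher is better, and the
# silhouette score is only defined for K >= 2.
from sklearn.metrics import silhouette_score
for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(scaled_features)
    print(f'K={k}: silhouette score = {silhouette_score(scaled_features, labels):.3f}')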
# Step 5: Apply K-Means Clustering
# Choose an appropriate K (e.g., from the elbow curve)
optimal_k = 4 # Example
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(scaled_features)
# Add cluster labels to the original data
data['Cluster'] = clusters
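# Optional: profile each segment in the original (unscaled) units to make the clusters
# easier to interpret. The column names below assume the same features selected in Step 3.
cluster_profile = data.groupby('Cluster')[['Annual Income (k$)', 'Spending Score (1-100)']].agg(['mean', 'count'])
print(cluster_profile)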
# Step 6: Visualize the Results
plt.figure(figsize=(8, 6))
plt.scatter(scaled_features[:, 0], scaled_features[:, 1], c=clusters, cmap='viridis', alpha=0.6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', s=200, marker='X') # Cluster centers
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.title('Customer Segments')
plt.show()
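# Optional: convert the cluster centers back to the original feature scale so they can be
# read as actual income and spending-score values (a sketch using the fitted scaler).
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
print(pd.DataFrame(centers_original, columns=['Annual Income (k$)', 'Spending Score (1-100)']))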
# Save the segmented data
data.to_csv('segmented_customers.csv', index=False)