import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Plot defaults: the 'fivethirtyeight' style sheet and a 12 x 10 default figure.
plt.style.use('fivethirtyeight')
plt.rcParams.update({"figure.figsize": (12, 10)})

# Make the notebook echo every bare expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#Step 1 Import the data
# NOTE(review): the name `all` shadows Python's builtin all().  It is kept
# because the later cells refer to the DataFrame by this name.
all = pd.read_csv('all528.csv')

# Display a numerical summary
summary_num = all.describe()
summary_num

# Display a summary of the DataFrame.
# DataFrame.info() prints its report and returns None, so assigning its return
# value stores nothing.  Write the report into a text buffer instead so that
# `summary` really holds the report, then print it.
import io

_info_buffer = io.StringIO()
all.info(buf=_info_buffer)
summary = _info_buffer.getvalue()
print(summary)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   HEIGHT  528 non-null    float64
 1   LEAGUE  528 non-null    object 
dtypes: float64(1), object(1)
memory usage: 8.4+ KB

#Step 2 Display the shape and first few rows of the dataset
all.shape
all.head()

# Order the rows by LEAGUE, then HEIGHT, both descending.  sort_values returns
# a new DataFrame here, so `all` itself keeps its original row order.
sort_keys = ['LEAGUE', 'HEIGHT']
all_sort = all.sort_values(sort_keys, ascending=False)

# Display the sorted data
all_sort

(528, 2)

#Step 3 Split the data
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable.  HEIGHT is the only feature and LEAGUE is the class label.
features = all[['HEIGHT']]
target = all['LEAGUE']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=20240923
)

# Number of observations in each set
n_train = len(X_train)
n_test = len(X_test)

# How many WNBA and NBA rows ended up in the training data
freq_train = y_train.value_counts()

print('Number of observations in the training data:',n_train)
print('Number of observations in the testing data:',n_test)
print('Frequencies in the training data:\n',freq_train)

Number of observations in the training data: 369
Number of observations in the testing data: 159
Frequencies in the training data:
 LEAGUE
NBA     250
WNBA    119
Name: count, dtype: int64

#Step 4 Scatter plot
# One-dimensional strip plot of the training heights, one color per league.
# Integer marker codes 2 and 3 are matplotlib's tick-up / tick-down markers.
wnba_heights = X_train.loc[y_train == 'WNBA', 'HEIGHT']
nba_heights = X_train.loc[y_train == 'NBA', 'HEIGHT']

# Size the y-vectors from the data rather than hard-coding 119 and 250, so the
# plot keeps working if the dataset, test_size or random_state changes.
plt.scatter(wnba_heights, np.zeros(len(wnba_heights)), color='Plum', marker=2, s=500, label='WNBA', linewidth=.75)
plt.scatter(nba_heights, np.zeros(len(nba_heights)), color='Lime', marker=3, s=500, label='NBA', linewidth=.75)
plt.xlabel('Height')
plt.title('Training Data')
plt.grid(color = 'black')
# The label= arguments above are only shown once a legend is drawn.
plt.legend()
plt.show();

#Step 5 Fit the KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict on the training data.  The prediction is computed once and shared by
# both names: `yhat` is read by the later metric cells, `predictions` below.
yhat = knn.predict(X_train)
predictions = yhat

# Training rows grouped by the league the model predicted for them
WNBA = X_train[predictions == 'WNBA']
NBA = X_train[predictions == 'NBA']

# Scatter plot of predictions
plt.scatter(x=WNBA, y=np.zeros(len(WNBA)), color='plum', marker=2, s=500, label='WNBA', linewidth=.75)
plt.scatter(x=NBA, y=np.zeros(len(NBA)), color='lime', marker=3, s=500, label='NBA', linewidth=.75)
plt.xlabel('Height')
plt.grid(color = 'Black')
plt.title('Predictions on Training Data')
# The label= arguments above are only shown once a legend is drawn.
plt.legend()
plt.show();

# Fit KNN with larger k values
knn = KNeighborsClassifier(n_neighbors=150)
knn.fit(X_train, y_train)

# Predict on the training data.  The prediction is computed once and shared by
# both names: `yhat` is read by the later metric cells, `predictions` below.
yhat = knn.predict(X_train)
predictions = yhat

# Training rows grouped by the league the model predicted for them
WNBA = X_train[predictions == 'WNBA']
NBA = X_train[predictions == 'NBA']

# Scatter plot of predictions
plt.scatter(x=WNBA, y=np.zeros(len(WNBA)), color='plum', marker=2, s=500, label='WNBA', linewidth=.75)
plt.scatter(x=NBA, y=np.zeros(len(NBA)), color='lime', marker=3, s=500, label='NBA', linewidth=.75)
plt.xlabel('Height')
plt.title('Predictions: k = 150')
plt.grid(color = 'black')
# The label= arguments above are only shown once a legend is drawn.
plt.legend()
plt.show();

# Fit the KNN classifier - smallest k value to separate the boundary
knn = KNeighborsClassifier(n_neighbors = 55)
knn.fit(X_train, y_train)

# Predict on the training data.  The prediction is computed once and shared by
# both names: `yhat` is read by the later metric cells, `predictions` below.
yhat = knn.predict(X_train)
predictions = yhat

# Training rows grouped by the league the model predicted for them
WNBA = X_train[predictions == 'WNBA']
NBA = X_train[predictions == 'NBA']

# Scatter plot of predictions
plt.scatter(x=WNBA, y=np.zeros(len(WNBA)), color='plum', marker=2, s=500, label='WNBA', linewidth=.75)
plt.scatter(x=NBA, y=np.zeros(len(NBA)), color='lime', marker=3, s=500, label='NBA', linewidth=.75)
plt.xlabel('Height')
plt.title('Predictions: k = 55')
plt.grid(color = 'black')
# The label= arguments above are only shown once a legend is drawn.
plt.legend()
plt.show();

# use this code cell to display results for Problem 6
## Confusion matrix
# Pass `labels` explicitly so row/column 0 is WNBA and row/column 1 is NBA,
# matching the table headers built below.  When `labels` is omitted,
# confusion_matrix orders classes by sorted label (NBA before WNBA), which
# left the WNBA/NBA headers attached to the wrong rows and columns.
league_order = ['WNBA', 'NBA']
cmat = confusion_matrix(y_train, yhat, labels=league_order)
cmat

# Build the headers from the same list that fixes the matrix order.
pd.DataFrame(
    cmat,
    columns=pd.MultiIndex.from_tuples([('Model', league) for league in league_order]),
    index=pd.MultiIndex.from_tuples([('Ground Truth', league) for league in league_order]),
)

array([[236,  14],
       [ 27,  92]], dtype=int64)

# Training accuracy: the fraction of training labels that `yhat` gets right.
accuracy = accuracy_score(y_true=y_train, y_pred=yhat)
print('Training Accuracy',accuracy)

Training Accuracy 0.8888888888888888

# Refit the KNN classifier with k = 55 (smaller than the k = 150 fit above)
knn = KNeighborsClassifier(n_neighbors = 55)
knn.fit(X_train, y_train)

# Predict on the training data
yhat = knn.predict(X_train)

## Confusion matrix
# Pass `labels` explicitly so row/column 0 is WNBA and row/column 1 is NBA,
# matching the table headers built below.  When `labels` is omitted,
# confusion_matrix orders classes by sorted label (NBA before WNBA), which
# left the WNBA/NBA headers attached to the wrong rows and columns.
league_order = ['WNBA', 'NBA']
cmat = confusion_matrix(y_train, yhat, labels=league_order)
cmat

# Build the headers from the same list that fixes the matrix order.
pd.DataFrame(
    cmat,
    columns=pd.MultiIndex.from_tuples([('Model', league) for league in league_order]),
    index=pd.MultiIndex.from_tuples([('Ground Truth', league) for league in league_order]),
)


# Training accuracy: the fraction of training labels predicted correctly
accuracy = accuracy_score(y_train, yhat)
print('Training Accuracy for k = 55',accuracy)

KNeighborsClassifier(n_neighbors=55)

KNeighborsClassifier(n_neighbors=55)

array([[236,  14],
       [ 27,  92]], dtype=int64)

Training Accuracy for k = 55 0.8888888888888888

# use this code cell to display results for Problem 10
# Score the held-out test rows with the most recently fitted `knn`.
yhat_pred = knn.predict(X_test)

# Test confusion matrix, with rows and columns pinned to WNBA first, NBA second
cmat_test = confusion_matrix(y_test, yhat_pred, labels=['WNBA', 'NBA'])
model_header = pd.MultiIndex.from_tuples([('Model','WNBA'),('Model','NBA')])
truth_header = pd.MultiIndex.from_tuples([('Ground Truth','WNBA'),('Ground Truth','NBA')])
pd.DataFrame(cmat_test, columns=model_header, index=truth_header)

# Fraction of test labels predicted correctly (rebinds `accuracy`)
accuracy = accuracy_score(y_true=y_test, y_pred=yhat_pred)
print('Test Accuracy', accuracy)

Test Accuracy 0.8679245283018868

	HEIGHT
count	528.000000
mean	75.923977
std	4.715112
min	61.470000
25%	72.895000
50%	76.550000
75%	79.432500
max	85.800000

	HEIGHT	LEAGUE
426	80.71	WNBA
457	80.56	WNBA
462	79.78	WNBA
491	79.74	WNBA
500	79.09	WNBA
...	...	...
25	71.59	NBA
254	71.56	NBA
83	71.55	NBA
340	71.53	NBA
258	71.27	NBA

HW KNN WNBA NBA

Big picture

Instructions

Code

Results and discussion

		Model
		WNBA	NBA
Ground Truth	WNBA	236	14
Ground Truth	NBA	27	92

		Model
		WNBA	NBA
Ground Truth	WNBA	236	14
Ground Truth	NBA	27	92

		Model
		WNBA	NBA
Ground Truth	WNBA	37	12
Ground Truth	NBA	9	101