In this document, I will continue analyzing the toy dataset (available here). The objective is to fit a logistic regression model and a kNN classifier for the `Illness` variable.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # Suppress some warning messages
Recall that our dataset has one invalid entry with `Income` less than zero; this entry will be excluded below.
# Read the dataset
df = pd.read_csv('toy_dataset.csv', index_col = 0)
df.shape
(150000, 5)
# Discard invalid data
df = df[df['Income']>=0]
df.shape
(149999, 5)
df.head()
Number | City | Gender | Age | Income | Illness
---|---|---|---|---|---
1 | Dallas | Male | 41 | 40367.0 | No
2 | Dallas | Male | 54 | 45084.0 | No
3 | Dallas | Male | 42 | 52483.0 | No
4 | Dallas | Male | 40 | 40941.0 | No
5 | Dallas | Male | 46 | 50289.0 | No
Now, we would like to treat `Illness` as the response and all other variables as covariates. Below we one-hot encode the categorical covariates, partition the dataset with a 75-25 train-test split, and then scale the numeric covariates.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Get all X and all y
df_X = df[['Age', 'Income']]
for var in ['City', 'Gender']:
    df_X = df_X.join(pd.get_dummies(df[var], prefix = var))
df_y = df['Illness'].replace({"Yes": 1, "No":0})
# 75-25 train-test split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state = 0)
# Scale Age and Income using a min_max_scaler
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X_train[['Age', 'Income']])
MinMaxScaler()
# Transformed X_train
pd.set_option('mode.chained_assignment', None) # Avoid chained assignment warning
X_train.loc[:,['Age', 'Income']] = min_max_scaler.transform(X_train[['Age', 'Income']])
X_train.head()
Number | Age | Income | City_Austin | City_Boston | City_Dallas | City_Los Angeles | City_Mountain View | City_New York City | City_San Diego | City_Washington D.C. | Gender_Female | Gender_Male
---|---|---|---|---|---|---|---|---|---|---|---|---
50780 | 0.100 | 0.447639 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0
7357 | 0.225 | 0.420721 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1
12236 | 0.100 | 0.271695 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0
704 | 0.175 | 0.291307 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1
934 | 0.675 | 0.246680 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1
# Transformed X_test
X_test.loc[:,['Age', 'Income']] = min_max_scaler.transform(X_test[['Age', 'Income']])
X_test.head()
Number | Age | Income | City_Austin | City_Boston | City_Dallas | City_Los Angeles | City_Mountain View | City_New York City | City_San Diego | City_Washington D.C. | Gender_Female | Gender_Male
---|---|---|---|---|---|---|---|---|---|---|---|---
141326 | 0.975 | 0.440702 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1
14285 | 0.200 | 0.335091 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1
114267 | 0.100 | 0.832302 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0
112295 | 0.900 | 0.795439 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0
32481 | 0.225 | 0.576979 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1
Notice that the `min_max_scaler` should only be fitted on the training set, in order to prevent data leakage from the test set. We can verify this from the summary statistics of `X_train` and `X_test`: the minimum and maximum of the latter are not necessarily 0 and 1.
X_train[['Age', 'Income']].describe()
 | Age | Income
---|---|---
count | 112499.000000 | 112499.000000
mean | 0.498543 | 0.513393
std | 0.289380 | 0.141669
min | 0.000000 | 0.000000
25% | 0.250000 | 0.454571
50% | 0.500000 | 0.527159
75% | 0.750000 | 0.588717
max | 1.000000 | 1.000000
X_test[['Age', 'Income']].describe()
 | Age | Income
---|---|---
count | 37500.000000 | 37500.000000
mean | 0.499395 | 0.513803
std | 0.289116 | 0.141071
min | 0.000000 | 0.034762
25% | 0.250000 | 0.455038
50% | 0.500000 | 0.526978
75% | 0.750000 | 0.588414
max | 1.000000 | 0.958578
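As a side note, another way to guard against this kind of leakage is to bundle the scaling and the model into a single `Pipeline`, so that the scaler is only ever fitted on whatever data the pipeline itself is fitted on. The sketch below is illustrative only and is not used in the rest of this document; `pipe` and `X_train_raw` are hypothetical names, and the final estimator anticipates the logistic regression fitted below.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# Scale only Age and Income; pass the dummy columns through unchanged
pipe = Pipeline([
    ('scale', ColumnTransformer([('minmax', preprocessing.MinMaxScaler(), ['Age', 'Income'])],
                                remainder = 'passthrough')),
    ('clf', LogisticRegression(fit_intercept = False, random_state = 0)),
])
# pipe.fit(X_train_raw, y_train) # X_train_raw: the training features before scaling (hypothetical)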
A simple model for classifying `Illness` is logistic regression, which is readily available in the `sklearn` package. Since we have included dummy variables for all levels of `City` and `Gender`, the dummy columns already add up to a constant, so there is no need to include an intercept in our model.
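As a quick sanity check (not strictly necessary), we can confirm that within each row the `City` dummies sum to one and the `Gender` dummies sum to one, so an intercept column would be perfectly collinear with the existing columns; `city_cols` and `gender_cols` are just illustrative helper names.
# Each observation has exactly one City dummy and one Gender dummy set to 1
city_cols = [c for c in X_train.columns if c.startswith('City_')]
gender_cols = [c for c in X_train.columns if c.startswith('Gender_')]
print(X_train[city_cols].sum(axis = 1).eq(1).all(), X_train[gender_cols].sum(axis = 1).eq(1).all())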
from sklearn.linear_model import LogisticRegression
lgm = LogisticRegression(fit_intercept = False, random_state = 0)
lgm.fit(X_train, y_train)
LogisticRegression(fit_intercept=False, random_state=0)
The fitted coefficients can be inspected as follows.
lgm.coef_
array([[ 0.01031286, -0.13642189, -0.47615061, -0.49543042, -0.49494307, -0.49159108, -0.41948432, -0.48699063, -0.4781675 , -0.42141711, -1.89137689, -1.87279785]])
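For readability, each coefficient can be paired with its column name, for example:
# Label each coefficient with the corresponding feature name
pd.Series(lgm.coef_[0], index = X_train.columns)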
Now, we can predict the probabilities of `Illness` on the test set. Predicting the hard class label is less informative: it will be `No` in essentially every case, since the average probability of illness is only about 8%.
prob_pred = lgm.predict_proba(X_test)
prob_pred
array([[0.91675214, 0.08324786], [0.91769189, 0.08230811], [0.91859242, 0.08140758], ..., [0.91650771, 0.08349229], [0.91996722, 0.08003278], [0.91605379, 0.08394621]])
ill_pred = lgm.predict(X_test)
ill_pred.max()
0
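If hard labels are nevertheless needed, one common option is to lower the decision threshold from the default 0.5 towards the base rate. A minimal sketch, where using the training prevalence as the cutoff is just an illustrative choice:
# Flag an individual as ill whenever the predicted probability exceeds the
# training prevalence (roughly 0.08) instead of the default 0.5
cutoff = y_train.mean()
ill_pred_low = (prob_pred[:, 1] > cutoff).astype(int)
ill_pred_low.mean() # Fraction of test observations flagged as ill at this cutoff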
One way to visualize the fitted logistic regression is a partial dependence plot. Let us first plot the probability of `Illness` against `Age`, averaging out the effects of all other covariates. We observe that the probability of `Illness` increases with `Age`, and the logistic model provides a reasonably good fit on the training set.
to_plot = pd.DataFrame({'Age': np.linspace(0, 1, 50), 'Fit': np.nan, 'Actual': np.nan})
prob_pred = lgm.predict_proba(X_train) # Fitted probabilities on the training set
for i in np.arange(0, 50):
    # Partial dependence: fix Age at this value for everyone and average the predictions
    X_tmp = X_train.copy()
    X_tmp['Age'] = to_plot['Age'][i]
    pred_tmp = lgm.predict_proba(X_tmp)
    to_plot.iloc[i, 1] = np.mean(pred_tmp[:, 1])
    # Average fitted probability for training observations with Age near this value
    to_plot.iloc[i, 2] = np.mean(prob_pred[:, 1][(X_train['Age'] >= to_plot['Age'][i] - 0.01) & (X_train['Age'] < to_plot['Age'][i] + 0.01)])
# Plotting
plt.figure()
plt.scatter(to_plot['Age'], to_plot['Fit'], label = 'Fitted')
plt.scatter(to_plot['Age'], to_plot['Actual'], label = 'Actual')
plt.legend()
plt.xlabel('Age (Scaled)')
plt.ylabel('Pr(Illness)')
plt.title('Pr(Illness) against Age')
Text(0.5, 1.0, 'Pr(Illness) against Age')
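As an aside, recent versions of sklearn ship a built-in partial dependence utility in `sklearn.inspection`; assuming such a version is installed, a comparable plot for `Age` could be produced roughly as follows.
# Built-in partial dependence plot for Age (requires a reasonably recent sklearn)
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay.from_estimator(lgm, X_train, features = ['Age'])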
A similar plot can be produced for the relationship between `Illness` and `Income`. The overall trend is decreasing, but the fit is poor in the tail (i.e. for individuals with large `Income`). This is not surprising: recall the histogram of `Income`, which shows that most individuals have incomes close to the mean and median, and that is the region the model fits best.
to_plot = pd.DataFrame({'Income': np.linspace(0, 1, 50), 'Fit': np.nan, 'Actual': np.nan})
prob_pred = lgm.predict_proba(X_train) # Fitted probabilities on the training set
for i in np.arange(0, 50):
    # Partial dependence: fix Income at this value for everyone and average the predictions
    X_tmp = X_train.copy()
    X_tmp['Income'] = to_plot['Income'][i]
    pred_tmp = lgm.predict_proba(X_tmp)
    to_plot.iloc[i, 1] = np.mean(pred_tmp[:, 1])
    # Average fitted probability for training observations with Income near this value
    to_plot.iloc[i, 2] = np.mean(prob_pred[:, 1][(X_train['Income'] >= to_plot['Income'][i] - 0.01) & (X_train['Income'] < to_plot['Income'][i] + 0.01)])
# Plotting
plt.figure()
plt.scatter(to_plot['Income'], to_plot['Fit'], label = 'Fitted')
plt.scatter(to_plot['Income'], to_plot['Actual'], label = 'Actual')
plt.legend()
plt.xlabel('Income (Scaled)')
plt.ylabel('Pr(Illness)')
plt.title('Pr(Illness) against Income')
Text(0.5, 1.0, 'Pr(Illness) against Income')
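For reference, the `Income` histogram mentioned above can be reproduced in a single line on the original (unscaled) data:
# Distribution of Income in the cleaned dataset
df['Income'].hist(bins = 50)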
Similar plots can be made for the discrete covariates. Let us first look at `City`. The probabilities of `Illness` appear to be fitted quite well across cities.
to_plot = pd.DataFrame({'City': df['City'].unique(), 'Fit': np.nan, 'Actual': np.nan})
prob_pred = lgm.predict_proba(X_train) # Fitted probabilities on the training set
for i in to_plot['City']:
    # Partial dependence: assign every observation to this city and average the predictions
    X_tmp = X_train.copy()
    X_tmp.iloc[:, 2:10] = 0 # Zero out all eight City dummy columns
    X_tmp['City_'+i] = 1
    pred_tmp = lgm.predict_proba(X_tmp)
    to_plot.loc[to_plot['City']==i, 'Fit'] = np.mean(pred_tmp[:, 1])
    # Average fitted probability among training observations from this city
    to_plot.loc[to_plot['City']==i, 'Actual'] = np.mean(prob_pred[:, 1][X_train['City_'+i]==1])
to_plot = to_plot.melt(id_vars = 'City', var_name = 'Label', value_name = 'Prob')
# Plotting
plt.figure()
sns.barplot(data = to_plot, x = 'City', y = 'Prob', hue = 'Label')
plt.xlabel('City')
plt.ylabel('Pr(Illness)')
plt.title('Pr(Illness) against City')
plt.xticks(rotation = 45)
plt.legend(bbox_to_anchor=(1.04,1), loc = 'upper left')
<matplotlib.legend.Legend at 0x282b4865d30>
A similar plot can be produced for `Gender`. The `Illness` probability for females is severely under-fitted, which is probably related to the relative sizes of the male (56%) and female (44%) groups in the dataset.
to_plot = pd.DataFrame({'Gender': df['Gender'].unique(), 'Fit': np.nan, 'Actual': np.nan})
prob_pred = lgm.predict_proba(X_train) # Fitted probabilities on the training set
for i in to_plot['Gender']:
    # Partial dependence: assign every observation to this gender and average the predictions
    X_tmp = X_train.copy()
    X_tmp.iloc[:, 10:12] = 0 # Zero out both Gender dummy columns
    X_tmp['Gender_'+i] = 1
    pred_tmp = lgm.predict_proba(X_tmp)
    to_plot.loc[to_plot['Gender']==i, 'Fit'] = np.mean(pred_tmp[:, 1])
    # Average fitted probability among training observations of this gender
    to_plot.loc[to_plot['Gender']==i, 'Actual'] = np.mean(prob_pred[:, 1][X_train['Gender_'+i]==1])
to_plot = to_plot.melt(id_vars = 'Gender', var_name = 'Label', value_name = 'Prob')
# Plotting
plt.figure()
sns.barplot(data = to_plot, x = 'Gender', y = 'Prob', hue = 'Label')
plt.xlabel('Gender')
plt.ylabel('Pr(Illness)')
plt.title('Pr(Illness) against Gender')
plt.xticks(rotation = 45)
plt.legend(bbox_to_anchor=(1.04,1), loc = 'upper left')
<matplotlib.legend.Legend at 0x282ad30d280>
df['Gender'].value_counts(normalize = True)
Male 0.55867 Female 0.44133 Name: Gender, dtype: float64
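To see the gap more directly, we can also compare the raw illness rates by gender in the cleaned data:
# Empirical illness rate by gender
df.groupby('Gender')['Illness'].apply(lambda s: (s == 'Yes').mean())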
Next, we move to a non-parametric approach - k-Nearest Neighbours (kNN) - for classifying `Illness`. The kNN model is also readily available in the `sklearn` package. Let us start with the default setting of `k=5`.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)
KNeighborsClassifier()
We can take a look at the fitted model on the training and test sets. With `k=5`, the proportion of `Illness` predicted on the training set (about 0.5%) is far below the actual rate (about 8.1%), so the positive class is severely under-predicted. On the test set, the classification accuracy is around 0.9163.
prob_fit = neigh.predict_proba(X_train)
class_fit = neigh.predict(X_train)
prob_fit
array([[0.8, 0.2], [0.6, 0.4], [0.8, 0.2], ..., [1. , 0. ], [0.8, 0.2], [1. , 0. ]])
print(y_train.mean(), class_fit.mean())
0.08132516733482076 0.005075600672005973
neigh.score(X_test, y_test)
0.9163466666666666
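Since roughly 92% of individuals are healthy, an accuracy around 0.916 is close to what a classifier that always predicts `No` would achieve, so accuracy alone is not very informative here. A confusion matrix on the test set gives a fuller picture:
# Rows: actual class (0 = No, 1 = Yes); columns: predicted class
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, neigh.predict(X_test))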
To carry out model selection, i.e. to choose the number of neighbours `k`, we try a range of values of `k` and pick the one that maximizes the test score. For simplicity, we skip cross-validation in this document, although it would be considerably more robust.
knn_result = pd.DataFrame({'k': np.arange(1, 10), 'Train Score': np.nan, 'Test Score': np.nan})
for nk in knn_result['k']:
    neigh = KNeighborsClassifier(n_neighbors = nk)
    neigh.fit(X_train, y_train)
    knn_result.loc[knn_result['k']==nk, 'Train Score'] = neigh.score(X_train, y_train)
    knn_result.loc[knn_result['k']==nk, 'Test Score'] = neigh.score(X_test, y_test)
We see that the test score increases with `k` and levels off after `k=6`, which may be considered a reasonable choice for the number of neighbours under this particular train-test split.
to_plot = knn_result.melt(id_vars = 'k', var_name = 'Label', value_name = 'Prob')
# Plotting
plt.figure()
sns.lineplot(data = to_plot, x = 'k', y = 'Prob', hue = 'Label')
plt.xlabel('k')
plt.ylabel('Score')
plt.title('kNN Score vs k')
plt.legend(bbox_to_anchor=(1.04,1), loc = 'upper left')
<matplotlib.legend.Legend at 0x282b9de8f40>
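As noted above, cross-validation would be a more robust way to choose `k`. A minimal sketch using sklearn's `cross_val_score` on the training set (5 folds is an arbitrary choice):
# 5-fold cross-validated accuracy on the training set for each candidate k
from sklearn.model_selection import cross_val_score
for nk in np.arange(1, 10):
    cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors = nk), X_train, y_train, cv = 5)
    print(nk, cv_scores.mean())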