# The usual setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats


# Load the toy dataset; the first CSV column is used as the row index
dataset_path = "toy_dataset.csv"
df = pd.read_csv(dataset_path, index_col=0)


# Dimensions of the freshly loaded frame as a (rows, columns) tuple
df.shape

(150000, 5)


# Preview the first rows (pandas shows five by default) to check columns and value formats
df.head()


# Describe continuous variables
# With default arguments describe() summarises only the numeric columns
# (count, mean, std, min, quartiles, max).
df.describe()


# Invalid data: negative income
# Print the offending rows explicitly: a bare expression in the middle of a
# cell is evaluated and then discarded, so it was never actually shown.
print(df[df['Income'] < 0])
# Remove invalid
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments made later in this file (log transform,
# category conversion) do not raise pandas' SettingWithCopyWarning.
df = df[df['Income'] >= 0].copy()


# Re-check the dimensions after dropping the negative-income rows
df.shape

(149999, 5)


# Distribution of Age as a 100-bin histogram
age_ax = df['Age'].hist(bins=100)
age_ax.set_title("Histogram of Age")

Text(0.5, 1.0, 'Histogram of Age')


# Distribution of raw Income as a 100-bin histogram
income_ax = df['Income'].hist(bins=100)
income_ax.set_title("Histogram of Income")

Text(0.5, 1.0, 'Histogram of Income')


# Transform Income
# Replace Income with its natural log before plotting.
# The earlier filter keeps rows with Income >= 0, so a zero income can reach
# this point; np.log(0) is -inf, which has no finite histogram range. Mask
# non-positive values to NaN first so they are skipped instead.
# NOTE: the column is overwritten in place, so re-running this cell would
# apply the log a second time.
positive_income = df['Income'].where(df['Income'] > 0)
df['Income'] = np.log(positive_income)
# Print the summary explicitly; as a bare mid-cell expression it was discarded.
print(df['Income'].describe())
# Plotting
df['Income'].hist(bins = 100)
plt.title("Histogram of log(Income)")

Text(0.5, 1.0, 'Histogram of log(Income)')


# Convert the discrete columns into pandas categoricals and list their levels
categorical_columns = ('City', 'Gender', 'Illness')
for column in categorical_columns:
    as_category = df[column].astype('category')
    df[column] = as_category
    print(as_category.unique())

['Dallas', 'New York City', 'Los Angeles', 'Mountain View', 'Boston', 'Washington D.C.', 'San Diego', 'Austin']
Categories (8, object): ['Austin', 'Boston', 'Dallas', 'Los Angeles', 'Mountain View', 'New York City', 'San Diego', 'Washington D.C.']
['Male', 'Female']
Categories (2, object): ['Female', 'Male']
['No', 'Yes']
Categories (2, object): ['No', 'Yes']


# Row counts per City; tick labels are rotated 45 degrees
city_ax = sns.countplot(x='City', data=df)
city_ax.set_title('Bar Plot of City')
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 [Text(0, 0, 'Austin'),
  Text(1, 0, 'Boston'),
  Text(2, 0, 'Dallas'),
  Text(3, 0, 'Los Angeles'),
  Text(4, 0, 'Mountain View'),
  Text(5, 0, 'New York City'),
  Text(6, 0, 'San Diego'),
  Text(7, 0, 'Washington D.C.')])


# Row counts per Gender
gender_ax = sns.countplot(x='Gender', data=df)
gender_ax.set_title('Bar Plot of Gender')

Text(0.5, 1.0, 'Bar Plot of Gender')


# Row counts per Illness status
illness_ax = sns.countplot(x='Illness', data=df)
illness_ax.set_title('Bar Plot of Illness')

Text(0.5, 1.0, 'Bar Plot of Illness')


# Share of rows in each Illness class (class counts divided by the row count)
illness_counts = df['Illness'].value_counts()
illness_counts / len(df)

No     0.919079
Yes    0.080921
Name: Illness, dtype: float64


# Illness vs. Age: overlaid 100-bin Age histograms, one colour per Illness status
age_by_illness_ax = sns.histplot(data=df, x='Age', hue='Illness', bins=100)
age_by_illness_ax.set_title("Histogram of Age by Illness status")

Text(0.5, 1.0, 'Histogram of Age by Illness status')


# Age distribution for each Illness status, drawn as violins
age_violin_ax = sns.violinplot(x='Illness', y='Age', data=df)
age_violin_ax.set_title("Violin Plot of Age by Illness status")

Text(0.5, 1.0, 'Violin Plot of Age by Illness status')


# Illness vs. log(Income): the Income column was log-transformed earlier in this file
income_by_illness_ax = sns.histplot(data=df, x='Income', hue='Illness', bins=100)
income_by_illness_ax.set_title("Histogram of log(Income) by Illness status")

Text(0.5, 1.0, 'Histogram of log(Income) by Illness status')


# log(Income) distribution for each Illness status, drawn as violins
income_violin_ax = sns.violinplot(x='Illness', y='Income', data=df)
income_violin_ax.set_title("Violin Plot of log(Income) by Illness status")

Text(0.5, 1.0, 'Violin Plot of log(Income) by Illness status')


# Illness vs. Gender: joint proportions over all rows, with row/column totals
pd.crosstab(
    index=df['Gender'],
    columns=df['Illness'],
    margins=True,
    normalize='all',
)


# Chi-squared test of independence on the Gender x Illness count table
gender_by_illness = pd.crosstab(df['Gender'], df['Illness'])
stats.chi2_contingency(gender_by_illness)

(0.25259927807912536,
 0.6152507634973752,
 1,
 array([[60842.14120761,  5356.85879239],
        [77018.85879239,  6781.14120761]]))


# Illness vs. City: joint proportions over all rows, with row/column totals
pd.crosstab(
    index=df['City'],
    columns=df['Illness'],
    margins=True,
    normalize='all',
)


# Chi-squared test of independence on the City x Illness count table
city_by_illness = pd.crosstab(df['City'], df['Illness'])
stats.chi2_contingency(city_by_illness)

(2.927681297686849,
 0.8916107023609688,
 7,
 array([[11297.32472883,   994.67527117],
        [ 7629.27860186,   671.72139814],
        [18111.3798492 ,  1594.6201508 ],
        [29569.54348362,  2603.45651638],
        [13068.39084927,  1150.60915073],
        [46236.13042087,  4070.86957913],
        [ 4486.02684685,   394.97315315],
        [ 7462.9252195 ,   657.0747805 ]]))

	Age	Income
count	150000.000000	150000.000000
mean	44.950200	91252.798273
std	11.572486	24989.500948
min	25.000000	-654.000000
25%	35.000000	80867.750000
50%	45.000000	93655.000000
75%	55.000000	104519.000000
max	65.000000	177157.000000

Illness	No	Yes	All
Gender
Female	0.405796	0.035534	0.44133
Male	0.513283	0.045387	0.55867
All	0.919079	0.080921	1.00000

Illness	No	Yes	All
City
Austin	0.075207	0.006740	0.081947
Boston	0.050767	0.004573	0.055340
Dallas	0.120627	0.010747	0.131374
Los Angeles	0.197368	0.017120	0.214488
Mountain View	0.086941	0.007853	0.094794
New York City	0.308575	0.026807	0.335382
San Diego	0.029914	0.002627	0.032540
Washington D.C.	0.049680	0.004453	0.054134
All	0.919079	0.080921	1.000000

Data Loading and First Look

More Plotting

	City	Gender	Age	Income	Illness
Number
1	Dallas	Male	41	40367.0	No
2	Dallas	Male	54	45084.0	No
3	Dallas	Male	42	52483.0	No
4	Dallas	Male	40	40941.0	No
5	Dallas	Male	46	50289.0	No