In this document, we will play with linear regression and generalized linear models in sklearn. We will predict real estate prices based on a number of covariates. The dataset is available here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # Suppress some warning messages
As usual, we will start with some exploratory analysis.
df = pd.read_csv("Real estate.csv", index_col = 0)
df.head()
| No | X1 transaction date | X2 house age | X3 distance to the nearest MRT station | X4 number of convenience stores | X5 latitude | X6 longitude | Y house price of unit area |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 2012.917 | 32.0 | 84.87882 | 10 | 24.98298 | 121.54024 | 37.9 |
| 2 | 2012.917 | 19.5 | 306.59470 | 9 | 24.98034 | 121.53951 | 42.2 |
| 3 | 2013.583 | 13.3 | 561.98450 | 5 | 24.98746 | 121.54391 | 47.3 |
| 4 | 2013.500 | 13.3 | 561.98450 | 5 | 24.98746 | 121.54391 | 54.8 |
| 5 | 2012.833 | 5.0 | 390.56840 | 5 | 24.97937 | 121.54245 | 43.1 |
df.describe()
| | X2 house age | X3 distance to the nearest MRT station | X4 number of convenience stores | X5 latitude | X6 longitude | Y house price of unit area |
| --- | --- | --- | --- | --- | --- | --- |
| count | 414.000000 | 414.000000 | 414.000000 | 414.000000 | 414.000000 | 414.000000 |
| mean | 17.712560 | 1083.885689 | 4.094203 | 24.969030 | 121.533361 | 37.980193 |
| std | 11.392485 | 1262.109595 | 2.945562 | 0.012410 | 0.015347 | 13.606488 |
| min | 0.000000 | 23.382840 | 0.000000 | 24.932070 | 121.473530 | 7.600000 |
| 25% | 9.025000 | 289.324800 | 1.000000 | 24.963000 | 121.528085 | 27.700000 |
| 50% | 16.100000 | 492.231300 | 4.000000 | 24.971100 | 121.538630 | 38.450000 |
| 75% | 28.150000 | 1454.279000 | 6.000000 | 24.977455 | 121.543305 | 46.600000 |
| max | 43.800000 | 6488.021000 | 10.000000 | 25.014590 | 121.566270 | 117.500000 |
The variable X1 transaction date appears to encode the year plus the month as a fraction (for example, 2012.917 ≈ 2012 + 11/12, i.e., November 2012). Let us reduce it to just the year number and ignore the month. We will also treat the transaction year as a factor rather than as a numeric variable. Roughly 70% of the houses were sold in 2013, and the remainder in 2012.
df['X1 transaction date'] = np.floor(df['X1 transaction date']).astype('int').astype('category')
df['X1 transaction date'].value_counts(normalize = True)
2013    0.695652
2012    0.304348
Name: X1 transaction date, dtype: float64
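The fractional-month encoding is a guess, since it is not documented in the file itself. As a quick sanity check, we can recover the month from the fractional part of the raw dates (a minimal sketch; it re-reads the CSV because we have already overwritten the column):

# Hypothesis check: fractional part * 12 ~ month (e.g., 2012.917 -> 11, November).
raw_dates = pd.read_csv("Real estate.csv", index_col = 0)['X1 transaction date']
months = np.round((raw_dates - np.floor(raw_dates)) * 12).astype(int)
months.value_counts().sort_index()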
We can group by X1 transaction date and compute summary statistics of the other variables. In particular, the most expensive house price of unit area occurs in 2013. Nothing else stands out for the other variables.
df.groupby('X1 transaction date').agg([np.mean, np.min, np.max])
(Each variable is summarized by its mean, min, and max; X2 = house age, X3 = distance to the nearest MRT station, X4 = number of convenience stores, X5 = latitude, X6 = longitude, Y = house price of unit area.)

| X1 transaction date | X2 mean | X2 min | X2 max | X3 mean | X3 min | X3 max | X4 mean | X4 min | X4 max | X5 mean | X5 min | X5 max | X6 mean | X6 min | X6 max | Y mean | Y min | Y max |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 2012 | 16.866667 | 0.0 | 40.9 | 1052.404169 | 23.38284 | 6306.153 | 4.119048 | 0 | 10 | 24.968878 | 24.93885 | 24.99176 | 121.533056 | 121.47516 | 121.56627 | 36.304762 | 11.6 | 71.0 |
| 2013 | 18.082639 | 0.0 | 43.8 | 1097.658854 | 49.66105 | 6488.021 | 4.083333 | 0 | 10 | 24.969097 | 24.93207 | 25.01459 | 121.533495 | 121.47353 | 121.56627 | 38.713194 | 7.6 | 117.5 |
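As an aside, passing the statistic names as strings produces the same table with cleaner labels than the 'amin'/'amax' names that numpy functions carry; a minimal variant:

# Same summary, with 'min'/'max' column labels instead of 'amin'/'amax'.
df.groupby('X1 transaction date').agg(['mean', 'min', 'max'])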
We also plot the histogram of house price by transaction date. It seems that house prices are somewhat higher in 2013.
plt.figure()
sns.histplot(data = df, hue = 'X1 transaction date', x = 'Y house price of unit area', multiple = 'dodge')
<AxesSubplot:xlabel='Y house price of unit area', ylabel='Count'>
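To back this visual impression with numbers, a quick check of the per-year average and median price (the means also appear in the grouped table above):

# Average and median unit price per transaction year.
df.groupby('X1 transaction date')['Y house price of unit area'].agg(['mean', 'median'])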
The next variable to consider is X2 house age. We plot its marginal distribution and its joint distribution with the response. Nothing in particular stands out in this plot.
# jointplot creates its own figure, so no extra plt.figure() is needed
sns.jointplot(data = df, x = 'X2 house age', y = 'Y house price of unit area', kind = 'hex')
<seaborn.axisgrid.JointGrid at 0x1ea2ba0a4c0>
As above, the same figures can be plotted for X3 distance to the nearest MRT station and X4 number of convenience stores. These two variables are, respectively, negatively and positively correlated with the house price.
sns.jointplot(data = df, x = 'X3 distance to the nearest MRT station', y = 'Y house price of unit area', kind = 'hex')
<seaborn.axisgrid.JointGrid at 0x1ea29d4ab50>
sns.jointplot(data = df, x = 'X4 number of convenience stores', y = 'Y house price of unit area', kind = 'hex')
# sns.histplot(data = df, hue = 'X4 number of convenience stores', x = 'Y house price of unit area', multiple = 'dodge')
<seaborn.axisgrid.JointGrid at 0x1ea2fe62280>
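To quantify what the two plots suggest, we can compute the correlation of each covariate with the price; a minimal sketch using pandas' corrwith:

# Pearson correlation of each covariate with the unit price.
cols = ['X2 house age',
        'X3 distance to the nearest MRT station',
        'X4 number of convenience stores']
df[cols].corrwith(df['Y house price of unit area'])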
Finally, we look at X5 latitude and X6 longitude together. Directly pivoting the house prices on the raw coordinates gives a very sparse heatmap, so we also bin the latitude and longitude into a coarser grid and average the prices within each cell. It seems that real estate prices are higher in the southeast region.
tmp = df.pivot_table(columns = 'X5 latitude', index = 'X6 longitude', values = 'Y house price of unit area')
plt.figure()
sns.heatmap(tmp)
<AxesSubplot:xlabel='X5 latitude', ylabel='X6 longitude'>
# Aggregate by small grid
df_tmp = df[['X5 latitude', 'X6 longitude', 'Y house price of unit area']].copy()
df_tmp['X5 latitude'] = pd.cut(df_tmp['X5 latitude'], 50)
df_tmp['X6 longitude'] = pd.cut(df_tmp['X6 longitude'], 50)
df_tmp = df_tmp.groupby(['X5 latitude', 'X6 longitude']).agg(np.mean)
df_tmp = df_tmp.pivot_table(columns = 'X5 latitude', index = 'X6 longitude', values = 'Y house price of unit area')
plt.figure()
sns.heatmap(df_tmp)
# plt.xticks(ticks = [])
# plt.yticks(ticks = [])
<AxesSubplot:xlabel='X5 latitude', ylabel='X6 longitude'>
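Purely as a readability tweak (not part of the analysis above), the binned grid can be transposed and flipped so it reads like a conventional map, with north at the top and longitude on the horizontal axis:

# Map-like orientation: latitude on y (north up), longitude on x.
plt.figure()
sns.heatmap(df_tmp.T.iloc[::-1])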
Now, we conduct a simple linear regression of Y house price of unit area on all the other variables. In terms of preprocessing, we only need to convert X1 transaction date into a dummy variable (0 for 2012, 1 for 2013). We also scale the continuous covariates with a MinMaxScaler.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
df_X = df[['X1 transaction date']].replace({2012: 0, 2013: 1})
df_X = df_X.join(df.iloc[:,1:6])
df_Y = df.iloc[:,6]
# 75-25 train-test split (the default proportion of train_test_split)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, random_state = 0)
# Scale covariates
min_max_scaler = preprocessing.MinMaxScaler()
X_train.iloc[:,[1, 2, 4, 5]] = min_max_scaler.fit_transform(X_train.iloc[:,[1, 2, 4, 5]])
X_test.iloc[:,[1, 2, 4, 5]] = min_max_scaler.transform(X_test.iloc[:,[1, 2, 4, 5]])
X_train.head()
| No | X1 transaction date | X2 house age | X3 distance to the nearest MRT station | X4 number of convenience stores | X5 latitude | X6 longitude |
| --- | --- | --- | --- | --- | --- | --- |
| 323 | 1 | 0.294521 | 0.025384 | 1 | 0.506665 | 0.606858 |
| 209 | 0 | 0.262557 | 0.206780 | 1 | 0.242002 | 0.807526 |
| 57 | 1 | 0.767123 | 0.053811 | 8 | 0.490427 | 0.723097 |
| 9 | 1 | 0.723744 | 0.849027 | 1 | 0.228793 | 0.119150 |
| 313 | 1 | 0.808219 | 0.045656 | 9 | 0.468250 | 0.724175 |
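As an aside, the same preprocessing can be expressed as a scikit-learn ColumnTransformer pipeline. Below is a minimal sketch that should reproduce the fit below up to a reordering of the columns; the manual version above is what the rest of this document uses.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Scale the four continuous covariates; pass the dummy-coded year and
# the store count through unchanged.
continuous_cols = ['X2 house age', 'X3 distance to the nearest MRT station',
                   'X5 latitude', 'X6 longitude']
preprocess = ColumnTransformer(
    [('minmax', preprocessing.MinMaxScaler(), continuous_cols)],
    remainder = 'passthrough')
pipe = make_pipeline(preprocess, LinearRegression(fit_intercept = False))

# Refit on a fresh (unscaled) copy of the same split.
X_tr, X_te, y_tr, y_te = train_test_split(df_X, df_Y, random_state = 0)
pipe.fit(X_tr, y_tr)
pipe.score(X_te, y_te)  # should match lm.score(X_test, y_test) below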
# Fit the linear model
lm = LinearRegression(fit_intercept = False)
lm.fit(X_train, y_train)
LinearRegression(fit_intercept=False)
The linear model only achieves an R² of 0.5215 on the training set and 0.5122 on the test set, which is not ideal. The fitted coefficients are shown below.
lm.score(X_train, y_train)
0.5215627657658704
lm.score(X_test, y_test)
0.5122286680285608
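R² alone does not tell us how large the errors are in price units; a small sketch computing the test RMSE, in the same units as Y:

from sklearn.metrics import mean_squared_error

# Root mean squared error on the test set, on the price-per-unit-area scale.
np.sqrt(mean_squared_error(y_test, lm.predict(X_test)))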
# View the coefficients
np.vstack([X_train.columns, lm.coef_])
array([['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude'], [3.8407022535237783, -9.4608344076313, 2.47460843883863, 1.6086435367466345, 30.81042218155324, 28.33196215115027]], dtype=object)
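The same coefficients are easier to read as a labeled Series:

# Label each coefficient by its covariate.
pd.Series(lm.coef_, index = X_train.columns)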
To visualize the fitted model, let us compare the fitted value with the actual house price.
fitted_train = lm.predict(X_train)
fitted_test = lm.predict(X_test)
plt.figure()
plt.scatter(y_train, fitted_train, label = 'Training', s = 5)
plt.scatter(y_test, fitted_test, label = 'Testing', s = 5)
plt.plot([0, 100], [0, 100], label = '45 Degree Line', color = 'black', linewidth = 2)
plt.legend(loc = 'upper left')
plt.xlabel('Actual Y')
plt.ylabel('Fitted Y')
Text(0, 0.5, 'Fitted Y')
Considering that house prices can only be nonnegative, the linear model above is not fully adequate, as it allows negative fitted values. A better-suited model is the Gamma Generalized Linear Model (GLM) with a log link, which is also implemented in the sklearn package.
from sklearn.linear_model import GammaRegressor
glm = GammaRegressor(fit_intercept = False)
glm.fit(X_train, y_train)
GammaRegressor(fit_intercept=False)
# View the coefficients
np.vstack([X_train.columns, glm.coef_])
array([['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude'], [0.7270366912476239, 0.4743051127871534, 0.40651793362705, 0.4034942619638806, 0.5638321145579172, 0.8110351064173907]], dtype=object)
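Because GammaRegressor uses a log link, the covariates act multiplicatively on the predicted price: the prediction is exp(Xβ), with no intercept term here since we set fit_intercept = False. A small consistency check:

# Predictions under the log link are exp(X @ coef); verify against predict().
manual = np.exp(X_train.to_numpy(dtype = float) @ glm.coef_)
np.allclose(manual, glm.predict(X_train))  # expected: True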
The score returned by the glm object is D², the fraction of deviance explained: one minus the ratio of the model deviance to the null deviance. Like R², it is at most 1, and negative values indicate a fit worse than an intercept-only model.
glm.score(X_train, y_train)
-14.377904791234817
glm.score(X_test, y_test)
-20.270968944017753
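To make this concrete, the score can be reproduced from the deviance directly (a sketch, assuming a scikit-learn version that provides mean_gamma_deviance):

from sklearn.metrics import mean_gamma_deviance

# D^2 = 1 - (model deviance) / (deviance of predicting the mean of y).
dev_model = mean_gamma_deviance(y_test, glm.predict(X_test))
dev_null = mean_gamma_deviance(y_test, np.full(len(y_test), y_test.mean()))
1 - dev_model / dev_null  # should match glm.score(X_test, y_test)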
Again, we can make a similar visualization of the GLM's fitted values. Compared with the simple linear model, the GLM seems to overfit in the upper tail.
fitted_train = glm.predict(X_train)
fitted_test = glm.predict(X_test)
plt.figure()
plt.scatter(y_train, fitted_train, label = 'Training', s = 5)
plt.scatter(y_test, fitted_test, label = 'Testing', s = 5)
plt.plot([0, 100], [0, 100], label = '45 Degree Line', color = 'black', linewidth = 2)
plt.legend(loc = 'upper left')
plt.xlabel('Actual Y')
plt.ylabel('Fitted Y')
Text(0, 0.5, 'Fitted Y')
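To compare the two fits on a common footing, one option is the test RMSE; a quick sketch, not part of the original analysis:

from sklearn.metrics import mean_squared_error

# Test RMSE of both models on the price scale.
for name, model in [('Linear model', lm), ('Gamma GLM', glm)]:
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print(f'{name}: test RMSE = {rmse:.2f}')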