In [62]:
Copied!
import time
# data analysis and preparing
import pandas as pd
import math
import numpy as np
import random as rnd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import scipy.stats as stats
# tqdm.pandas()
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, tree
from sklearn.metrics import auc, roc_curve, confusion_matrix, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import time
# data analysis and preparing
import pandas as pd
import math
import numpy as np
import random as rnd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import scipy.stats as stats
# tqdm.pandas()
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, tree
from sklearn.metrics import auc, roc_curve, confusion_matrix, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
In [63]:
Copied!
train_df = pd.read_csv("HW1/train.csv")
test_df = pd.read_csv("HW1/test.csv")
train_df
train_df = pd.read_csv("HW1/train.csv")
test_df = pd.read_csv("HW1/test.csv")
train_df
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 1456 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
| 1456 | 1457 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
| 1457 | 1458 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
| 1458 | 1459 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
| 1459 | 1460 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
1460 rows × 81 columns
In [64]:
Copied!
from sklearn.model_selection import train_test_split
X = train_df.drop(columns=['SalePrice']).copy()
y = train_df['SalePrice']
from sklearn.model_selection import train_test_split
X = train_df.drop(columns=['SalePrice']).copy()
y = train_df['SalePrice']
In [65]:
Copied!
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8)
In [66]:
Copied!
plt.figure(figsize=(18, 5)) #adjust the size of plot
sns.histplot(train_df['LotFrontage'], stat='frequency').set(title='LotFrontage Histogram (Continuous)')
plt.figure(figsize=(18, 5)) #adjust the size of plot
sns.histplot(train_df['LotFrontage'], stat='frequency').set(title='LotFrontage Histogram (Continuous)')
[Text(0.5, 1.0, 'LotFrontage Histogram (Continuous)')]
In [67]:
Copied!
#plt.figure(figsize=(18, 5)) #adjust the size of plot
sns.histplot(train_df['MSZoning'], stat='frequency').set(title='MSZoning Histogram (Categorical)')
#plt.figure(figsize=(18, 5)) #adjust the size of plot
sns.histplot(train_df['MSZoning'], stat='frequency').set(title='MSZoning Histogram (Categorical)')
[Text(0.5, 1.0, 'MSZoning Histogram (Categorical)')]
In [68]:
Copied!
plt.figure(figsize=(18, 5)) #adjust the size of plot
sns.histplot(x='YearRemodAdd', hue="OverallQual", data=train_df, bins=20, stat='count', multiple='stack')
plt.figure(figsize=(18, 5)) #adjust the size of plot
sns.histplot(x='YearRemodAdd', hue="OverallQual", data=train_df, bins=20, stat='count', multiple='stack')
<Axes: xlabel='YearRemodAdd', ylabel='Count'>
Pre-processing¶
In [69]:
Copied!
train_df.info()
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
In [70]:
Copied!
train_df.describe(percentiles=[.1, .2, .6, .7, .8, .9, .95, .99])
train_df.describe(percentiles=[.1, .2, .6, .7, .8, .9, .95, .99])
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | ... | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | ... | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | ... | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 10% | 146.900000 | 20.000000 | 44.000000 | 5000.000000 | 5.000000 | 5.000000 | 1924.900000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 | 2006.000000 | 106475.000000 |
| 20% | 292.800000 | 20.000000 | 53.000000 | 7078.400000 | 5.000000 | 5.000000 | 1947.800000 | 1961.800000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2006.000000 | 124000.000000 |
| 50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 60% | 876.400000 | 60.000000 | 74.000000 | 10198.200000 | 6.000000 | 5.000000 | 1984.000000 | 1998.000000 | 16.000000 | 525.600000 | ... | 100.000000 | 40.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 7.000000 | 2008.000000 | 179280.000000 |
| 70% | 1022.300000 | 60.000000 | 79.000000 | 11066.500000 | 7.000000 | 6.000000 | 1997.300000 | 2002.000000 | 117.000000 | 655.000000 | ... | 144.000000 | 57.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 7.000000 | 2009.000000 | 198620.000000 |
| 80% | 1168.200000 | 80.000000 | 85.000000 | 12205.800000 | 7.000000 | 7.000000 | 2003.000000 | 2005.000000 | 206.000000 | 806.400000 | ... | 192.000000 | 83.200000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 230000.000000 |
| 90% | 1314.100000 | 120.000000 | 96.000000 | 14381.700000 | 8.000000 | 7.000000 | 2006.000000 | 2006.000000 | 335.000000 | 1065.500000 | ... | 262.000000 | 130.000000 | 112.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 10.000000 | 2010.000000 | 278000.000000 |
| 95% | 1387.050000 | 160.000000 | 107.000000 | 17401.150000 | 8.000000 | 8.000000 | 2007.000000 | 2007.000000 | 456.000000 | 1274.000000 | ... | 335.000000 | 175.050000 | 180.150000 | 0.000000 | 160.000000 | 0.000000 | 0.000000 | 11.000000 | 2010.000000 | 326100.000000 |
| 99% | 1445.410000 | 190.000000 | 141.000000 | 37567.640000 | 10.000000 | 9.000000 | 2009.000000 | 2009.000000 | 791.920000 | 1572.410000 | ... | 505.460000 | 285.820000 | 261.050000 | 168.000000 | 268.050000 | 0.000000 | 700.000000 | 12.000000 | 2010.000000 | 442567.010000 |
| max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
14 rows × 38 columns
In [71]:
Copied!
# Creating a new empty dataframe
missing_df = pd.DataFrame()
missing_df["Feature"] = X_train.columns
# Calculating the percentage of the missing values for each attribute
missing = ((X_train.isnull().sum() / len(X_train)) * 100).values
missing_df["Missing"] = missing
missing_df = missing_df[missing_df["Feature"] != "SalePrice"]
missing_df = missing_df[missing_df["Missing"] != 0]
missing_df = missing_df.sort_values(by="Missing", ascending=False)
plt.figure(figsize=(12, 5))
g = sns.barplot(data=missing_df, x="Feature", y="Missing", color="blue")
g.set_xticklabels(g.get_xticklabels(), rotation=30);
# Creating a new empty dataframe
missing_df = pd.DataFrame()
missing_df["Feature"] = X_train.columns
# Calculating the percentage of the missing values for each attribute
missing = ((X_train.isnull().sum() / len(X_train)) * 100).values
missing_df["Missing"] = missing
missing_df = missing_df[missing_df["Feature"] != "SalePrice"]
missing_df = missing_df[missing_df["Missing"] != 0]
missing_df = missing_df.sort_values(by="Missing", ascending=False)
plt.figure(figsize=(12, 5))
g = sns.barplot(data=missing_df, x="Feature", y="Missing", color="blue")
g.set_xticklabels(g.get_xticklabels(), rotation=30);
In [72]:
Copied!
def missing(df):
attributes = df.loc[df['Missing'] > 50]
return list(attributes['Feature'])
to_remove = missing(missing_df)
def remove_missing(df, to_remove):
return df.drop(columns=to_remove)
def missing(df):
attributes = df.loc[df['Missing'] > 50]
return list(attributes['Feature'])
to_remove = missing(missing_df)
def remove_missing(df, to_remove):
return df.drop(columns=to_remove)
In [73]:
Copied!
numerical_attr = X_train.dtypes[X_train.dtypes != "object"].index
categorical_attr = X_train.dtypes[X_train.dtypes == "object"].index
categorical_attr
numerical_attr = X_train.dtypes[X_train.dtypes != "object"].index
categorical_attr = X_train.dtypes[X_train.dtypes == "object"].index
categorical_attr
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
'SaleType', 'SaleCondition'],
dtype='object')
In [74]:
Copied!
fig = pd.melt(X_train, value_vars=numerical_attr)
a1 = sns.FacetGrid(fig, col="variable", col_wrap=8, sharex=False, sharey=False, height=2, palette="Set1")
a1 = a1.map(sns.histplot, "value", color='red')
plt.show()
fig = pd.melt(X_train, value_vars=numerical_attr)
a1 = sns.FacetGrid(fig, col="variable", col_wrap=8, sharex=False, sharey=False, height=2, palette="Set1")
a1 = a1.map(sns.histplot, "value", color='red')
plt.show()
In [75]:
Copied!
fig = pd.melt(X_train, value_vars=categorical_attr)
a1 = sns.FacetGrid(fig, col="variable", col_wrap=4, sharex=False, sharey=False, height=5, palette="Set3")
a1 = a1.map(sns.countplot, "value", color="red")
a1.set_xticklabels(rotation=45)
plt.show()
df_numerical = X_train[numerical_attr]
df_categorical = X_train[categorical_attr]
fig = pd.melt(X_train, value_vars=categorical_attr)
a1 = sns.FacetGrid(fig, col="variable", col_wrap=4, sharex=False, sharey=False, height=5, palette="Set3")
a1 = a1.map(sns.countplot, "value", color="red")
a1.set_xticklabels(rotation=45)
plt.show()
df_numerical = X_train[numerical_attr]
df_categorical = X_train[categorical_attr]
In [76]:
Copied!
to_normalise = ["MSSubClass", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
"PoolArea"]
other_numerical = list(set(numerical_attr) - set(to_normalise))
numerical_norm = df_numerical[to_normalise]
numerical_rest = df_numerical[other_numerical]
to_normalise = ["MSSubClass", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
"PoolArea"]
other_numerical = list(set(numerical_attr) - set(to_normalise))
numerical_norm = df_numerical[to_normalise]
numerical_rest = df_numerical[other_numerical]
Using SimpleImputer to replace the missing values with the attributes' means¶
In [77]:
Copied!
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numerical_norm = imputer.fit_transform(numerical_norm)
numerical_rest = imputer.fit_transform(numerical_rest)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numerical_norm = imputer.fit_transform(numerical_norm)
numerical_rest = imputer.fit_transform(numerical_rest)
To normalise the numerical attributes, we use StandardScaler¶
Scaling and Structuring Numerical Data:
- Initialize the
StandardScalerto standardize the features by removing the mean and scaling to unit variance. - Apply the scaler to the
numerical_normdata to normalize its values. - Convert the normalized array back into a dataframe (
numerical_norm) with appropriate column names sourced fromto_normalise. - Similarly, structure the
numerical_restdata into a dataframe with column names sourced fromother_numerical.
In [78]:
Copied!
scaler = StandardScaler()
numerical_norm = scaler.fit_transform(numerical_norm)
numerical_norm = pd.DataFrame(numerical_norm, columns=to_normalise)
numerical_rest = pd.DataFrame(numerical_rest, columns=other_numerical)
scaler = StandardScaler()
numerical_norm = scaler.fit_transform(numerical_norm)
numerical_norm = pd.DataFrame(numerical_norm, columns=to_normalise)
numerical_rest = pd.DataFrame(numerical_rest, columns=other_numerical)
In [79]:
Copied!
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(df_categorical)
df_categorical = imputer.transform(df_categorical)
df_categorical = pd.DataFrame(df_categorical, columns=categorical_attr)
df_categorical
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(df_categorical)
df_categorical = imputer.transform(df_categorical)
df_categorical = pd.DataFrame(df_categorical, columns=categorical_attr)
df_categorical
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RM | Pave | Grvl | Reg | Lvl | AllPub | FR2 | Gtl | BrkSide | Feedr | ... | Detchd | Unf | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
| 1 | RM | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | IDOTRR | Norm | ... | Detchd | Unf | TA | TA | Y | Gd | MnPrv | Shed | WD | Abnorml |
| 2 | RL | Pave | Grvl | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | Gd | MnPrv | Shed | New | Partial |
| 3 | RL | Pave | Grvl | Reg | Lvl | AllPub | FR2 | Gtl | NAmes | Norm | ... | Attchd | Unf | TA | TA | Y | Gd | MnPrv | Shed | ConLI | Normal |
| 4 | RL | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | RRAn | ... | Attchd | Fin | TA | TA | Y | Gd | MnPrv | Shed | New | Partial |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1163 | RL | Pave | Grvl | IR1 | Lvl | AllPub | Inside | Gtl | BrkSide | Norm | ... | Attchd | Unf | TA | TA | Y | Gd | GdWo | Shed | WD | Normal |
| 1164 | FV | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | Somerst | Norm | ... | Attchd | Fin | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
| 1165 | RL | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | Blmngtn | Norm | ... | Attchd | Fin | TA | TA | Y | Gd | MnPrv | Shed | New | Partial |
| 1166 | RL | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
| 1167 | RL | Pave | Grvl | Reg | Lvl | AllPub | Corner | Gtl | NWAmes | Norm | ... | Attchd | RFn | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
1168 rows × 43 columns
Merging the normalised numerical dataframe with the untouched features¶
In [80]:
Copied!
df_numerical = numerical_norm.join(numerical_rest, how="inner")
# Merging the categorical and numerical dataframes
merged = df_numerical.join(df_categorical, how="inner")
merged
df_numerical = numerical_norm.join(numerical_rest, how="inner")
# Merging the categorical and numerical dataframes
merged = df_numerical.join(df_categorical, how="inner")
merged
| MSSubClass | BsmtFinSF1 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | GarageArea | WoodDeckSF | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.643145 | -0.093430 | 0.395032 | 0.187377 | 0.346239 | -0.796237 | -0.108737 | -0.410118 | -0.147304 | -0.729387 | ... | Detchd | Unf | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
| 1 | -0.643145 | -0.352488 | 0.225600 | -0.251054 | -0.555627 | -0.796237 | -0.108737 | -1.068973 | -0.668893 | -0.729387 | ... | Detchd | Unf | TA | TA | Y | Gd | MnPrv | Shed | WD | Abnorml |
| 2 | -0.882150 | -0.967750 | 2.130564 | 0.985141 | 0.853861 | -0.796237 | -0.108737 | -0.039277 | 0.937417 | 2.069742 | ... | Attchd | RFn | TA | TA | Y | Gd | MnPrv | Shed | New | Partial |
| 3 | -0.882150 | -0.035142 | -0.310171 | -0.447671 | -0.779805 | -0.796237 | -0.108737 | -1.232746 | -2.178271 | -0.729387 | ... | Attchd | Unf | TA | TA | Y | Gd | MnPrv | Shed | ConLI | Normal |
| 4 | -0.882150 | -0.907303 | 1.649744 | 0.573830 | 0.384890 | -0.796237 | -0.108737 | -0.381881 | -0.345784 | 0.068086 | ... | Attchd | Fin | TA | TA | Y | Gd | MnPrv | Shed | New | Partial |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1163 | -0.643145 | -0.967750 | 0.367557 | -0.755025 | 0.359123 | -0.796237 | -0.108737 | -0.400706 | -2.178271 | 1.367966 | ... | Attchd | Unf | TA | TA | Y | Gd | GdWo | Shed | WD | Normal |
| 1164 | 0.073871 | 0.683744 | -0.722303 | -0.101897 | -0.362370 | 1.452250 | -0.108737 | 0.918887 | 1.569787 | 0.610367 | ... | Attchd | Fin | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
| 1165 | 1.507902 | -0.933209 | 1.521525 | 0.420153 | 0.209671 | -0.796237 | -0.108737 | -0.509887 | -0.156535 | 0.131883 | ... | Attchd | Fin | TA | TA | Y | Gd | MnPrv | Shed | New | Partial |
| 1166 | -0.882150 | 0.569327 | 0.443114 | 0.928643 | 0.817786 | -0.796237 | -0.108737 | -0.065631 | 0.618925 | -0.729387 | ... | Attchd | RFn | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
| 1167 | 0.073871 | -0.095589 | -0.603242 | -0.800224 | -1.181780 | 0.826524 | -0.108737 | -0.193637 | 1.403617 | -0.729387 | ... | Attchd | RFn | TA | TA | Y | Gd | MnPrv | Shed | WD | Normal |
1168 rows × 80 columns
In [81]:
Copied!
corr = train_df.corr()
corr = train_df.corr()
Feature Selection Using ANOVA F-test:
- Import necessary libraries for feature selection.
- Use the numerical data (
df_numerical) and the training target (y_train) for feature selection. - The
SelectKBestmethod with ANOVA F-test (f_classif) is used to determine the top 10 most important features. - The scores for each feature are stored in the
dfscoresdataframe and the corresponding column names in thedfcolumnsdataframe. - These two dataframes are then concatenated for a combined view, resulting in the
featureScoresdataframe which has columns 'Specs' (feature names) and 'Score' (importance scores).
In [82]:
Copied!
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
data = df_numerical
target = y_train
best_features = SelectKBest(score_func=f_classif, k=10).fit(data, target)
dfscores = pd.DataFrame(best_features.scores_)
dfcolumns = pd.DataFrame(data.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score'] #naming the dataframe columns
featureScores
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
data = df_numerical
target = y_train
best_features = SelectKBest(score_func=f_classif, k=10).fit(data, target)
dfscores = pd.DataFrame(best_features.scores_)
dfcolumns = pd.DataFrame(data.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score'] #naming the dataframe columns
featureScores
| Specs | Score | |
|---|---|---|
| 0 | MSSubClass | 0.824160 |
| 1 | BsmtFinSF1 | 1.596695 |
| 2 | BsmtUnfSF | 1.434095 |
| 3 | TotalBsmtSF | 2.032340 |
| 4 | 1stFlrSF | 1.960985 |
| 5 | 2ndFlrSF | 1.607515 |
| 6 | LowQualFinSF | 1.073885 |
| 7 | GrLivArea | 3.130788 |
| 8 | GarageArea | 2.467893 |
| 9 | WoodDeckSF | 1.400941 |
| 10 | OpenPorchSF | 1.314583 |
| 11 | EnclosedPorch | 0.820725 |
| 12 | 3SsnPorch | 1.452036 |
| 13 | ScreenPorch | 0.974432 |
| 14 | PoolArea | 0.897001 |
| 15 | BsmtHalfBath | 0.849346 |
| 16 | BedroomAbvGr | 1.272684 |
| 17 | Id | 0.915745 |
| 18 | MasVnrArea | 2.141524 |
| 19 | TotRmsAbvGrd | 1.823866 |
| 20 | LotFrontage | 1.056271 |
| 21 | Fireplaces | 1.470682 |
| 22 | YrSold | 1.048456 |
| 23 | FullBath | 2.600820 |
| 24 | MiscVal | 3.395764 |
| 25 | BsmtFinSF2 | 1.020022 |
| 26 | OverallQual | 5.210644 |
| 27 | HalfBath | 1.402837 |
| 28 | KitchenAbvGr | 0.802878 |
| 29 | LotArea | 3.596742 |
| 30 | YearRemodAdd | 1.635899 |
| 31 | MoSold | 0.886785 |
| 32 | BsmtFullBath | 1.141296 |
| 33 | OverallCond | 1.261440 |
| 34 | GarageYrBlt | 1.678893 |
| 35 | GarageCars | 2.832141 |
| 36 | YearBuilt | 2.210611 |
Visualizing Feature Correlations with a Heatmap:
- First, compute the correlation matrix
corr_matfor thetrain_dfdataframe. - Set a large figure size to ensure clarity in the heatmap visualization.
- Use Seaborn's
heatmapfunction to visualize the correlations among features, with annotations for exact values. The colormap "RdYlGn" represents a gradient from red (negative correlation) through yellow (no correlation) to green (positive correlation).
In [83]:
Copied!
corr_mat = train_df.corr()
plt.figure(figsize=(30, 20))
g = sns.heatmap(corr_mat, annot=True, cmap="RdYlGn")
corr_mat = train_df.corr()
plt.figure(figsize=(30, 20))
g = sns.heatmap(corr_mat, annot=True, cmap="RdYlGn")
In [84]:
Copied!
correlations = corr['SalePrice'].sort_values(ascending=False).to_frame()
correlations
correlations = corr['SalePrice'].sort_values(ascending=False).to_frame()
correlations
| SalePrice | |
|---|---|
| SalePrice | 1.000000 |
| OverallQual | 0.790982 |
| GrLivArea | 0.708624 |
| GarageCars | 0.640409 |
| GarageArea | 0.623431 |
| TotalBsmtSF | 0.613581 |
| 1stFlrSF | 0.605852 |
| FullBath | 0.560664 |
| TotRmsAbvGrd | 0.533723 |
| YearBuilt | 0.522897 |
| YearRemodAdd | 0.507101 |
| GarageYrBlt | 0.486362 |
| MasVnrArea | 0.477493 |
| Fireplaces | 0.466929 |
| BsmtFinSF1 | 0.386420 |
| LotFrontage | 0.351799 |
| WoodDeckSF | 0.324413 |
| 2ndFlrSF | 0.319334 |
| OpenPorchSF | 0.315856 |
| HalfBath | 0.284108 |
| LotArea | 0.263843 |
| BsmtFullBath | 0.227122 |
| BsmtUnfSF | 0.214479 |
| BedroomAbvGr | 0.168213 |
| ScreenPorch | 0.111447 |
| PoolArea | 0.092404 |
| MoSold | 0.046432 |
| 3SsnPorch | 0.044584 |
| BsmtFinSF2 | -0.011378 |
| BsmtHalfBath | -0.016844 |
| MiscVal | -0.021190 |
| Id | -0.021917 |
| LowQualFinSF | -0.025606 |
| YrSold | -0.028923 |
| OverallCond | -0.077856 |
| MSSubClass | -0.084284 |
| EnclosedPorch | -0.128578 |
| KitchenAbvGr | -0.135907 |
Constructing a List of Columns to Drop:
to_drop: Identify and list columns that have a correlation value less than 0.3 with 'SalePrice'.to_drop_cat: A manually curated list of categorical columns to be removed.- The 'Id' column is explicitly excluded from deletion by removing it from the
to_droplist. - The final
to_droplist is a combination of columns with low correlation, predefined columns into_remove, and the manually curated categorical columns.
In [85]:
Copied!
to_drop = to_drop = list(correlations[correlations['SalePrice'] < 0.3].index)
to_drop_cat = ["Utilities", "RoofStyle", "RoofMatl", "BsmtCond", "BsmtFinType2", "GarageCond", "GarageQual",
"PavedDrive"]
to_drop.remove('Id')
to_drop = to_drop + to_remove + to_drop_cat
to_drop
to_drop = to_drop = list(correlations[correlations['SalePrice'] < 0.3].index)
to_drop_cat = ["Utilities", "RoofStyle", "RoofMatl", "BsmtCond", "BsmtFinType2", "GarageCond", "GarageQual",
"PavedDrive"]
to_drop.remove('Id')
to_drop = to_drop + to_remove + to_drop_cat
to_drop
['HalfBath', 'LotArea', 'BsmtFullBath', 'BsmtUnfSF', 'BedroomAbvGr', 'ScreenPorch', 'PoolArea', 'MoSold', '3SsnPorch', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'YrSold', 'OverallCond', 'MSSubClass', 'EnclosedPorch', 'KitchenAbvGr', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Utilities', 'RoofStyle', 'RoofMatl', 'BsmtCond', 'BsmtFinType2', 'GarageCond', 'GarageQual', 'PavedDrive']
In [86]:
Copied!
merged = merged.drop(columns=to_drop)
merged
merged = merged.drop(columns=to_drop)
merged
| BsmtFinSF1 | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | GrLivArea | GarageArea | WoodDeckSF | OpenPorchSF | Id | MasVnrArea | ... | HeatingQC | CentralAir | Electrical | KitchenQual | Functional | FireplaceQu | GarageType | GarageFinish | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.093430 | 0.187377 | 0.346239 | -0.796237 | -0.410118 | -0.147304 | -0.729387 | -0.695124 | 762.0 | 0.0 | ... | TA | Y | SBrkr | Fa | Min1 | Gd | Detchd | Unf | WD | Normal |
| 1 | -0.352488 | -0.251054 | -0.555627 | -0.796237 | -1.068973 | -0.668893 | -0.729387 | -0.278892 | 897.0 | 0.0 | ... | Ex | N | SBrkr | TA | Typ | Gd | Detchd | Unf | WD | Abnorml |
| 2 | -0.967750 | 0.985141 | 0.853861 | -0.796237 | -0.039277 | 0.937417 | 2.069742 | -0.204565 | 221.0 | 0.0 | ... | Ex | Y | SBrkr | Gd | Typ | Gd | Attchd | RFn | New | Partial |
| 3 | -0.035142 | -0.447671 | -0.779805 | -0.796237 | -1.232746 | -2.178271 | -0.729387 | -0.695124 | 141.0 | 0.0 | ... | TA | Y | SBrkr | TA | Typ | Po | Attchd | Unf | ConLI | Normal |
| 4 | -0.907303 | 0.573830 | 0.384890 | -0.796237 | -0.381881 | -0.345784 | 0.068086 | -0.695124 | 923.0 | 0.0 | ... | Ex | Y | SBrkr | Gd | Typ | Gd | Attchd | Fin | New | Partial |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1163 | -0.967750 | -0.755025 | 0.359123 | -0.796237 | -0.400706 | -2.178271 | 1.367966 | -0.695124 | 251.0 | 0.0 | ... | Ex | Y | SBrkr | Fa | Mod | Gd | Attchd | Unf | WD | Normal |
| 1164 | 0.683744 | -0.101897 | -0.362370 | 1.452250 | 0.918887 | 1.569787 | 0.610367 | 0.077878 | 1443.0 | 160.0 | ... | Ex | Y | SBrkr | Ex | Typ | Ex | Attchd | Fin | WD | Normal |
| 1165 | -0.933209 | 0.420153 | 0.209671 | -0.796237 | -0.509887 | -0.156535 | 0.131883 | -0.695124 | 220.0 | 16.0 | ... | Ex | Y | SBrkr | Gd | Typ | Gd | Attchd | Fin | New | Partial |
| 1166 | 0.569327 | 0.928643 | 0.817786 | -0.796237 | -0.065631 | 0.618925 | -0.729387 | -0.174834 | 705.0 | 109.0 | ... | Ex | Y | SBrkr | Gd | Typ | Gd | Attchd | RFn | WD | Normal |
| 1167 | -0.095589 | -0.800224 | -1.181780 | 0.826524 | -0.193637 | 1.403617 | -0.729387 | 1.817132 | 1421.0 | 420.0 | ... | Gd | Y | SBrkr | TA | Typ | TA | Attchd | RFn | WD | Normal |
1168 rows × 50 columns
Generate binary values using get_dummies¶
One-Hot Encoding for Categorical Data:
- Use the
pd.get_dummies()method to apply one-hot encoding on theCentralAircolumn of themergeddataframe. One-hot encoding is used to convert categorical data into a format that can be provided to machine learning algorithms to improve predictions. In this case, theCentralAircolumn, which indicates whether a property has central air conditioning, is transformed into separate binary columns representing each unique value in the original column. - The transformed dataframe is stored in
dum_df.
In [87]:
Copied!
dum_df = pd.get_dummies(merged, columns=["CentralAir"])
dum_df
dum_df = pd.get_dummies(merged, columns=["CentralAir"])
dum_df
| BsmtFinSF1 | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | GrLivArea | GarageArea | WoodDeckSF | OpenPorchSF | Id | MasVnrArea | ... | Electrical | KitchenQual | Functional | FireplaceQu | GarageType | GarageFinish | SaleType | SaleCondition | CentralAir_N | CentralAir_Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.093430 | 0.187377 | 0.346239 | -0.796237 | -0.410118 | -0.147304 | -0.729387 | -0.695124 | 762.0 | 0.0 | ... | SBrkr | Fa | Min1 | Gd | Detchd | Unf | WD | Normal | 0 | 1 |
| 1 | -0.352488 | -0.251054 | -0.555627 | -0.796237 | -1.068973 | -0.668893 | -0.729387 | -0.278892 | 897.0 | 0.0 | ... | SBrkr | TA | Typ | Gd | Detchd | Unf | WD | Abnorml | 1 | 0 |
| 2 | -0.967750 | 0.985141 | 0.853861 | -0.796237 | -0.039277 | 0.937417 | 2.069742 | -0.204565 | 221.0 | 0.0 | ... | SBrkr | Gd | Typ | Gd | Attchd | RFn | New | Partial | 0 | 1 |
| 3 | -0.035142 | -0.447671 | -0.779805 | -0.796237 | -1.232746 | -2.178271 | -0.729387 | -0.695124 | 141.0 | 0.0 | ... | SBrkr | TA | Typ | Po | Attchd | Unf | ConLI | Normal | 0 | 1 |
| 4 | -0.907303 | 0.573830 | 0.384890 | -0.796237 | -0.381881 | -0.345784 | 0.068086 | -0.695124 | 923.0 | 0.0 | ... | SBrkr | Gd | Typ | Gd | Attchd | Fin | New | Partial | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1163 | -0.967750 | -0.755025 | 0.359123 | -0.796237 | -0.400706 | -2.178271 | 1.367966 | -0.695124 | 251.0 | 0.0 | ... | SBrkr | Fa | Mod | Gd | Attchd | Unf | WD | Normal | 0 | 1 |
| 1164 | 0.683744 | -0.101897 | -0.362370 | 1.452250 | 0.918887 | 1.569787 | 0.610367 | 0.077878 | 1443.0 | 160.0 | ... | SBrkr | Ex | Typ | Ex | Attchd | Fin | WD | Normal | 0 | 1 |
| 1165 | -0.933209 | 0.420153 | 0.209671 | -0.796237 | -0.509887 | -0.156535 | 0.131883 | -0.695124 | 220.0 | 16.0 | ... | SBrkr | Gd | Typ | Gd | Attchd | Fin | New | Partial | 0 | 1 |
| 1166 | 0.569327 | 0.928643 | 0.817786 | -0.796237 | -0.065631 | 0.618925 | -0.729387 | -0.174834 | 705.0 | 109.0 | ... | SBrkr | Gd | Typ | Gd | Attchd | RFn | WD | Normal | 0 | 1 |
| 1167 | -0.095589 | -0.800224 | -1.181780 | 0.826524 | -0.193637 | 1.403617 | -0.729387 | 1.817132 | 1421.0 | 420.0 | ... | SBrkr | TA | Typ | TA | Attchd | RFn | WD | Normal | 0 | 1 |
1168 rows × 51 columns
Using the LabelEncoder from sklearn's preprocessing module:
- Encode the categorical variables in the
dum_dfdataframe so that they are represented by unique integers. - The result is stored in the
encoded_dfdataframe. - The first five rows of this processed dataframe are displayed.
In [88]:
Copied!
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
encoded_df = dum_df.apply(le.fit_transform)
encoded_df.head()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
encoded_df = dum_df.apply(le.fit_transform)
encoded_df.head()
| BsmtFinSF1 | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | GrLivArea | GarageArea | WoodDeckSF | OpenPorchSF | Id | MasVnrArea | ... | Electrical | KitchenQual | Functional | FireplaceQu | GarageType | GarageFinish | SaleType | SaleCondition | CentralAir_N | CentralAir_Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 155 | 336 | 405 | 0 | 247 | 116 | 0 | 0 | 613 | 0 | ... | 4 | 1 | 2 | 2 | 5 | 2 | 8 | 4 | 0 | 1 |
| 1 | 88 | 214 | 182 | 0 | 69 | 59 | 0 | 17 | 724 | 0 | ... | 4 | 3 | 6 | 2 | 5 | 2 | 8 | 0 | 1 | 0 |
| 2 | 0 | 497 | 497 | 0 | 356 | 272 | 186 | 21 | 192 | 0 | ... | 4 | 2 | 6 | 2 | 1 | 1 | 6 | 5 | 0 | 1 |
| 3 | 172 | 163 | 127 | 0 | 38 | 0 | 0 | 0 | 122 | 0 | ... | 4 | 3 | 6 | 3 | 1 | 2 | 4 | 4 | 0 | 1 |
| 4 | 6 | 423 | 413 | 0 | 255 | 87 | 42 | 0 | 745 | 0 | ... | 4 | 2 | 6 | 2 | 1 | 0 | 6 | 5 | 0 | 1 |
5 rows × 51 columns
OLS¶
In [89]:
Copied!
# generate comment
# We'll apply the Normal Equation, which gives the parameters that minimizes
# the cost function of OLS Linear Regression. We first take the dot product of input
# DataFrame 'encoded_df' and its transpose, and then calculate inverse of the result.
#
# After getting the inverse, we again take the dot product with the transpose of the original dataframe.
# The result of this operation is then multiplied with the target variable 'y_train',
# providing the optimal parameters or weights of each feature for our Linear Model.
#
theta_best = np.linalg.inv(
encoded_df
.T
.dot(encoded_df)
).dot(encoded_df.T).dot(y_train)
# The calculated parameters are then set into a DataFrame for better visualization and interpretation.
# Each parameter represents the effect of the corresponding feature on the predicted output
#
theta_best_df = pd.DataFrame(data=theta_best[np.newaxis, :], columns=encoded_df.columns)
theta_best_df
# generate comment
# We'll apply the Normal Equation, which gives the parameters that minimizes
# the cost function of OLS Linear Regression. We first take the dot product of input
# DataFrame 'encoded_df' and its transpose, and then calculate inverse of the result.
#
# After getting the inverse, we again take the dot product with the transpose of the original dataframe.
# The result of this operation is then multiplied with the target variable 'y_train',
# providing the optimal parameters or weights of each feature for our Linear Model.
#
theta_best = np.linalg.inv(
encoded_df
.T
.dot(encoded_df)
).dot(encoded_df.T).dot(y_train)
# The calculated parameters are then set into a DataFrame for better visualization and interpretation.
# Each parameter represents the effect of the corresponding feature on the predicted output
#
theta_best_df = pd.DataFrame(data=theta_best[np.newaxis, :], columns=encoded_df.columns)
theta_best_df
| BsmtFinSF1 | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | GrLivArea | GarageArea | WoodDeckSF | OpenPorchSF | Id | MasVnrArea | ... | Electrical | KitchenQual | Functional | FireplaceQu | GarageType | GarageFinish | SaleType | SaleCondition | CentralAir_N | CentralAir_Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 44.665647 | 31.289266 | 73.37586 | 177.117183 | -13.502647 | 22.308273 | 55.19069 | 2.696359 | -1.55409 | 40.913455 | ... | -159.396563 | -8999.825656 | 4458.255439 | -2175.285665 | 968.702478 | -483.39908 | -336.922241 | 2127.160181 | 50935.665186 | 59144.313219 |
1 rows × 51 columns
In [90]:
Copied!
# generate comment in markdown
def dropAndEncode(merged):
to_drop = ['HalfBath',
'LotArea',
'BsmtFullBath',
'BsmtUnfSF',
'BedroomAbvGr',
'ScreenPorch',
'PoolArea',
'MoSold',
'3SsnPorch',
'BsmtFinSF2',
'BsmtHalfBath',
'MiscVal',
'LowQualFinSF',
'YrSold',
'OverallCond',
'MSSubClass',
'EnclosedPorch',
'KitchenAbvGr',
'Utilities',
'RoofStyle',
'RoofMatl',
'BsmtCond',
'BsmtFinType2',
'GarageCond',
'GarageQual',
'PavedDrive']
# Drop the specified features from the dataframe
merged = merged.drop(columns=to_drop)
# Perform one-hot encoding on the 'CentralAir' feature
final_df = pd.get_dummies(merged, columns=["CentralAir"])
# Label Encode the dataframe, converting categorical data into numbers.
le = preprocessing.LabelEncoder()
final_df = final_df.apply(le.fit_transform)
return final_df
# generate comment in markdown
def dropAndEncode(merged):
to_drop = ['HalfBath',
'LotArea',
'BsmtFullBath',
'BsmtUnfSF',
'BedroomAbvGr',
'ScreenPorch',
'PoolArea',
'MoSold',
'3SsnPorch',
'BsmtFinSF2',
'BsmtHalfBath',
'MiscVal',
'LowQualFinSF',
'YrSold',
'OverallCond',
'MSSubClass',
'EnclosedPorch',
'KitchenAbvGr',
'Utilities',
'RoofStyle',
'RoofMatl',
'BsmtCond',
'BsmtFinType2',
'GarageCond',
'GarageQual',
'PavedDrive']
# Drop the specified features from the dataframe
merged = merged.drop(columns=to_drop)
# Perform one-hot encoding on the 'CentralAir' feature
final_df = pd.get_dummies(merged, columns=["CentralAir"])
# Label Encode the dataframe, converting categorical data into numbers.
le = preprocessing.LabelEncoder()
final_df = final_df.apply(le.fit_transform)
return final_df
Handling Missing Values and Normalizing Numerical Data:
- The function
num_imputetakes in four parameters:numerical_norm,numerical_rest,other_numerical, andto_normalise. - Initially, the
SimpleImputeris utilized to handle missing data. For numerical data, a common strategy is to replace missing values with the mean of the attribute. This ensures that the imputed values don't introduce any bias that could impact the downstream modeling process. - After imputation, the
StandardScaleris employed to normalize thenumerical_normdata. Scaling numerical features is crucial for algorithms that rely on distance metrics or gradient descent, as it ensures all features have the same scale. This normalization is performed by removing the mean and scaling to unit variance. - Both the normalized and the remaining numerical data are structured into dataframes for easy management and interpretation.
- The function returns two dataframes: one with normalized values (
numerical_norm) and the other with imputed values (numerical_rest).
In [91]:
Copied!
def num_impute(numerical_norm, numerical_rest, other_numerical, to_normalise):
# Using SimpleImputer to replace the missing values with the attributes' means
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numerical_norm = imputer.fit_transform(numerical_norm)
numerical_rest = imputer.fit_transform(numerical_rest)
# To normalise the numerical attributes, we use StandardScaler
scaler = StandardScaler()
numerical_norm = scaler.fit_transform(numerical_norm)
numerical_norm = pd.DataFrame(numerical_norm, columns=to_normalise)
numerical_rest = pd.DataFrame(numerical_rest, columns=other_numerical)
return numerical_norm, numerical_rest
def num_impute(numerical_norm, numerical_rest, other_numerical, to_normalise):
# Using SimpleImputer to replace the missing values with the attributes' means
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numerical_norm = imputer.fit_transform(numerical_norm)
numerical_rest = imputer.fit_transform(numerical_rest)
# To normalise the numerical attributes, we use StandardScaler
scaler = StandardScaler()
numerical_norm = scaler.fit_transform(numerical_norm)
numerical_norm = pd.DataFrame(numerical_norm, columns=to_normalise)
numerical_rest = pd.DataFrame(numerical_rest, columns=other_numerical)
return numerical_norm, numerical_rest
Separating and Structuring Numerical and Categorical Attributes:
- The function
separate_numis designed to segregate and structure the numerical and categorical attributes from a given dataframedf. - First, it determines which attributes are numerical (those not of type "object") and which are categorical (those of type "object").
- The dataframe is then split into
df_numericalcontaining only the numerical attributes anddf_categoricalcontaining only the categorical ones. - Out of the numerical attributes, some are specifically chosen to be normalized, as listed in
to_normalise. These are features that might benefit from normalization due to their range or distribution. - The remaining numerical attributes are categorized under
other_numerical. - The function returns the following six outputs:
categorical_attr: A list of column names that are categorical.df_categorical: Dataframe containing only the categorical attributes.numerical_norm: Dataframe of the selected numerical attributes that are to be normalized.numerical_rest: Dataframe of the other numerical attributes not chosen for normalization.other_numerical: A list of column names from the numerical attributes that are not being normalized.to_normalise: A list of column names of the numerical attributes that are to be normalized.
The separation of attributes into different data structures simplifies subsequent preprocessing steps like normalization and encoding.
In [92]:
Copied!
def separate_num(df):
numerical_attr = df.dtypes[df.dtypes != "object"].index
categorical_attr = df.dtypes[df.dtypes == "object"].index
df_numerical = df[numerical_attr]
df_categorical = df[categorical_attr]
to_normalise = ["MSSubClass", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
"PoolArea"]
other_numerical = list(set(numerical_attr) - set(to_normalise))
numerical_norm = df_numerical[to_normalise]
numerical_rest = df_numerical[other_numerical]
return categorical_attr, df_categorical, numerical_norm, numerical_rest, other_numerical, to_normalise
def separate_num(df):
numerical_attr = df.dtypes[df.dtypes != "object"].index
categorical_attr = df.dtypes[df.dtypes == "object"].index
df_numerical = df[numerical_attr]
df_categorical = df[categorical_attr]
to_normalise = ["MSSubClass", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
"PoolArea"]
other_numerical = list(set(numerical_attr) - set(to_normalise))
numerical_norm = df_numerical[to_normalise]
numerical_rest = df_numerical[other_numerical]
return categorical_attr, df_categorical, numerical_norm, numerical_rest, other_numerical, to_normalise
In [93]:
Copied!
def pre_remove(df):
missing_df = pd.DataFrame()
missing_df["Feature"] = train_df.columns
# Calculating the percentage of the missing values for each attribute
missing = ((train_df.isnull().sum() / len(train_df)) * 100).values
missing_df["Missing"] = missing
missing_df = missing_df[missing_df["Missing"] != 0]
missing_df = missing_df.sort_values(by="Missing", ascending=False)
attributes = missing_df.loc[missing_df['Missing'] > 50]
to_remove = list(attributes['Feature'])
df = remove_missing(df, to_remove)
return df
def pre_remove(df):
missing_df = pd.DataFrame()
missing_df["Feature"] = train_df.columns
# Calculating the percentage of the missing values for each attribute
missing = ((train_df.isnull().sum() / len(train_df)) * 100).values
missing_df["Missing"] = missing
missing_df = missing_df[missing_df["Missing"] != 0]
missing_df = missing_df.sort_values(by="Missing", ascending=False)
attributes = missing_df.loc[missing_df['Missing'] > 50]
to_remove = list(attributes['Feature'])
df = remove_missing(df, to_remove)
return df
In [94]:
Copied!
def categorical_impute(categorical_attr, df_categorical):
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(df_categorical)
df_categorical = imputer.transform(df_categorical)
df_categorical = pd.DataFrame(df_categorical, columns=categorical_attr)
return df_categorical
def categorical_impute(categorical_attr, df_categorical):
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(df_categorical)
df_categorical = imputer.transform(df_categorical)
df_categorical = pd.DataFrame(df_categorical, columns=categorical_attr)
return df_categorical
In [95]:
Copied!
def preprocess(df):
# Remove missing values
df = pre_remove(df)
# Separate numerical - categorical
categorical_attr, df_categorical, numerical_norm, numerical_rest, other_numerical, to_normalise = separate_num(df)
numerical_norm, numerical_rest = num_impute(numerical_norm, numerical_rest, other_numerical, to_normalise)
# For the categorical attributes, we replace the missing values by the most frequent value in a column using SimpleImputer
df_categorical = categorical_impute(categorical_attr, df_categorical)
# Merging the normalised numerical dataframe with the untouched features
df_numerical = numerical_norm.join(numerical_rest, how="inner")
# Merging the categorical and numerical dataframes
merged = df_numerical.join(df_categorical, how="inner")
# Drop unnecessary columns
final_df = dropAndEncode(merged)
return final_df
def preprocess(df):
# Remove missing values
df = pre_remove(df)
# Separate numerical - categorical
categorical_attr, df_categorical, numerical_norm, numerical_rest, other_numerical, to_normalise = separate_num(df)
numerical_norm, numerical_rest = num_impute(numerical_norm, numerical_rest, other_numerical, to_normalise)
# For the categorical attributes, we replace the missing values by the most frequent value in a column using SimpleImputer
df_categorical = categorical_impute(categorical_attr, df_categorical)
# Merging the normalised numerical dataframe with the untouched features
df_numerical = numerical_norm.join(numerical_rest, how="inner")
# Merging the categorical and numerical dataframes
merged = df_numerical.join(df_categorical, how="inner")
# Drop unnecessary columns
final_df = dropAndEncode(merged)
return final_df
In [96]:
Copied!
valid_processed = preprocess(X_valid)
test_processed = preprocess(test_df)
valid_processed = preprocess(X_valid)
test_processed = preprocess(test_df)
In [97]:
Copied!
valid_processed.shape
valid_processed.shape
(292, 51)
In [98]:
Copied!
# Generate predictions on the new prices
y_valid_pred = valid_processed.dot(theta_best)
y_test_pred = test_processed.dot(theta_best)
y_test_pred
# Generate predictions on the new prices
y_valid_pred = valid_processed.dot(theta_best)
y_test_pred = test_processed.dot(theta_best)
y_test_pred
0 114782.528533
1 185004.005310
2 187284.919910
3 204908.719874
4 201431.165486
...
1454 52639.620850
1455 55770.834108
1456 188828.567119
1457 124047.724754
1458 286234.587270
Length: 1459, dtype: float64
In [99]:
Copied!
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
r2_score(y_valid, y_valid_pred)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
r2_score(y_valid, y_valid_pred)
0.5161761297251882
In [100]:
Copied!
mean_squared_error(y_valid, y_valid_pred)
mean_squared_error(y_valid, y_valid_pred)
2772362722.5048304
In [101]:
Copied!
df = pd.concat([test_df['Id'], y_test_pred], axis=1)
df = df.rename(columns={0: "SalePrice"})
df.to_csv('house.csv', index=False)
df = pd.concat([test_df['Id'], y_test_pred], axis=1)
df = df.rename(columns={0: "SalePrice"})
df.to_csv('house.csv', index=False)
In [ ]:
Copied!