HW1 Part2
In [18]:
# data analysis and preparing
import pandas as pd
import numpy as np
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [19]:
train_df = pd.read_csv("HW1-Part2/train.csv").set_index("PassengerId", drop=True)
test_df = pd.read_csv("HW1-Part2/test.csv").set_index("PassengerId", drop=True)
train_df
Out[19]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 11 columns
In [20]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['Survived']).copy()
y = train_df['Survived']
# Note: no random_state is set, so the split differs between runs
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8)
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
(712, 10)
(712,)
(179, 10)
(179,)
In [21]:
# Creating a new empty dataframe
missing_df = pd.DataFrame()
missing_df["Feature"] = X_train.columns
# Calculating the percentage of the missing values for each attribute
missing_df["Missing"] = ((X_train.isnull().sum() / len(X_train)) * 100).values
missing_df = missing_df[missing_df["Missing"] != 0]
missing_df = missing_df.sort_values(by="Missing", ascending=False)
plt.figure(figsize=(12, 5))
g = sns.barplot(data=missing_df, x="Feature", y="Missing", color="blue")
g.set_xticklabels(g.get_xticklabels(), rotation=30);
Attributes with more than 50% missing values are dropped so that heavy imputation does not degrade model performance.
In [22]:
to_remove = list(missing_df.loc[missing_df['Missing'] > 50]['Feature'])

def remove_missing(df, to_remove):
    # Drop the columns whose share of missing values exceeds the threshold
    return df.drop(columns=to_remove)
The Name and Ticket attributes, which do not provide sufficient information about a passenger's chance of surviving the Titanic, are also dropped.
In [23]:
X_train = X_train.drop(columns=["Name", "Ticket"])
In [24]:
X_train = remove_missing(X_train, to_remove)
X_train
Out[24]:
| PassengerId | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|
| 772 | 3 | male | 48.0 | 0 | 0 | 7.8542 | S |
| 640 | 3 | male | NaN | 1 | 0 | 16.1000 | S |
| 606 | 3 | male | 36.0 | 1 | 0 | 15.5500 | S |
| 156 | 1 | male | 51.0 | 0 | 1 | 61.3792 | C |
| 642 | 1 | female | 24.0 | 0 | 0 | 69.3000 | C |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 367 | 1 | female | 60.0 | 1 | 0 | 75.2500 | C |
| 491 | 3 | male | NaN | 1 | 0 | 19.9667 | S |
| 629 | 3 | male | 26.0 | 0 | 0 | 7.8958 | S |
| 65 | 1 | male | NaN | 0 | 0 | 27.7208 | C |
| 538 | 1 | female | 30.0 | 0 | 0 | 106.4250 | C |
712 rows × 7 columns
As the Titanic data set provides only a limited number of attributes, all remaining attributes are retained.
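Before imputing, it is worth confirming which of the remaining columns still contain missing values. A quick check (an addition, not part of the original notebook):

# Count remaining missing values per column; after dropping Cabin,
# Age (and occasionally Embarked) is typically all that remains incomplete
print(X_train.isnull().sum())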
In [25]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

numerical_attr = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_attr = ['Pclass', 'Sex', 'Embarked']

# Impute missing numerical values with the column mean, then standardize
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train[numerical_attr] = imputer.fit_transform(X_train[numerical_attr])
scaler = StandardScaler()
X_train[numerical_attr] = scaler.fit_transform(X_train[numerical_attr])

# Impute missing categorical values with the most frequent category
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train[categorical_attr] = imputer.fit_transform(X_train[categorical_attr])
X_train
Out[25]:
| PassengerId | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|
| 772 | 3 | male | 1.460435 | -0.460968 | -0.466341 | -0.495175 | S |
| 640 | 3 | male | 0.000000 | 0.418949 | -0.466341 | -0.328272 | S |
| 606 | 3 | male | 0.505604 | 0.418949 | -0.466341 | -0.339404 | S |
| 156 | 1 | male | 1.699143 | -0.460968 | 0.791366 | 0.588225 | C |
| 642 | 1 | female | -0.449226 | -0.460968 | -0.466341 | 0.748549 | C |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 367 | 1 | female | 2.415266 | 0.418949 | -0.466341 | 0.868983 | C |
| 491 | 3 | male | 0.000000 | 0.418949 | -0.466341 | -0.250006 | S |
| 629 | 3 | male | -0.290088 | -0.460968 | -0.466341 | -0.494333 | S |
| 65 | 1 | male | 0.000000 | -0.460968 | -0.466341 | -0.093055 | C |
| 538 | 1 | female | 0.028189 | -0.460968 | -0.466341 | 1.499996 | C |
712 rows × 7 columns
In [26]:
# Reshape to long format so each attribute gets its own histogram facet
melted = pd.melt(X_train)
g = sns.FacetGrid(melted, col="variable", col_wrap=5, sharex=False, sharey=False, height=2)
g.map(sns.histplot, "value", color='red')
plt.show()
In [27]:
X_train = pd.get_dummies(X_train, columns=categorical_attr)
X_train.describe()
Out[27]:
| | Age | SibSp | Parch | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.120000e+02 | 7.120000e+02 | 7.120000e+02 | 7.120000e+02 | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 |
| mean | 9.711528e-17 | 1.024461e-16 | -4.210115e-18 | 5.675859e-17 | 0.245787 | 0.207865 | 0.546348 | 0.352528 | 0.647472 | 0.196629 | 0.087079 | 0.716292 |
| std | 1.000703e+00 | 1.000703e+00 | 1.000703e+00 | 1.000703e+00 | 0.430855 | 0.406065 | 0.498197 | 0.478093 | 0.478093 | 0.397729 | 0.282148 | 0.451114 |
| min | -2.325469e+00 | -4.609680e-01 | -4.663409e-01 | -6.541516e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | -6.083649e-01 | -4.609680e-01 | -4.663409e-01 | -4.937417e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000e+00 | -4.609680e-01 | -4.663409e-01 | -3.615842e-01 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 75% | 4.260352e-01 | 4.189494e-01 | -4.663409e-01 | -3.581041e-02 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| max | 4.006651e+00 | 6.578371e+00 | 5.822195e+00 | 9.715901e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
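One caveat with pd.get_dummies: if a category happens to be absent from the validation or test split, the resulting dummy columns will not match the training columns and model.predict will fail. A common remedy, sketched here as an addition (X_valid_imputed is a hypothetical placeholder for the validation frame after the same dropping and imputation steps), is to reindex onto the training columns:

# X_valid_imputed is a placeholder for the imputed validation frame.
# Align its dummies with the training columns, filling any category
# unseen in the validation split with 0
X_valid_dummies = pd.get_dummies(X_valid_imputed, columns=categorical_attr)
X_valid_dummies = X_valid_dummies.reindex(columns=X_train.columns, fill_value=0)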
In [28]:
def preprocess(df):
    # Compute missing-value percentages from the training data
    missing_df = pd.DataFrame()
    missing_df["Feature"] = train_df.columns
    missing = ((train_df.isnull().sum() / len(train_df)) * 100).values
    missing_df["Missing"] = missing
    missing_df = missing_df[missing_df["Missing"] != 0]
    missing_df = missing_df.sort_values(by="Missing", ascending=False)
    # Remove attributes with more than 50% missing values
    to_remove = list(missing_df.loc[missing_df['Missing'] > 50]['Feature'])
    df = remove_missing(df, to_remove)
    # Drop unnecessary columns
    df = df.drop(columns=["Name", "Ticket"])
    # Replace the missing values with mean/most frequent
    numerical_attr = ['Age', 'SibSp', 'Parch', 'Fare']
    categorical_attr = ['Pclass', 'Sex', 'Embarked']
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    df[numerical_attr] = imputer.fit_transform(df[numerical_attr])
    # Note: the imputer and scaler are refit on the frame passed in,
    # rather than reusing the transformers fitted on the training split
    scaler = StandardScaler()
    df[numerical_attr] = scaler.fit_transform(df[numerical_attr])
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    df[categorical_attr] = imputer.fit_transform(df[categorical_attr])
    # One-hot encoding
    df = pd.get_dummies(df, columns=categorical_attr)
    return df
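Because preprocess refits the imputer and scaler on whatever frame it receives, validation and test rows are standardized with their own statistics rather than the training set's. A leak-free alternative is to fit the transformers once on the training split; the sketch below uses scikit-learn's Pipeline and ColumnTransformer and assumes X_train and X_valid still hold the raw columns from the split (a suggested refactor, not the notebook's approach):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical columns: mean imputation followed by standardization
numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])
# Categorical columns: mode imputation then one-hot encoding;
# handle_unknown='ignore' tolerates categories unseen during fitting
categorical_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Columns not listed (Name, Ticket, Cabin) are dropped by default
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, numerical_attr),
    ('cat', categorical_pipe, categorical_attr),
])
# Fit on the training split only; transform everywhere else, so the
# validation and test rows never influence the fitted statistics
X_train_t = preprocessor.fit_transform(X_train)
X_valid_t = preprocessor.transform(X_valid)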
In [29]:
from sklearn.linear_model import LogisticRegression

# L2-regularized logistic regression (penalty='l2' is also the default)
model = LogisticRegression(penalty="l2")
model.fit(X_train, y_train)
y_valid_pred = model.predict(preprocess(X_valid))
In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('accuracy score', accuracy_score(y_valid, y_valid_pred))
print('precision score', precision_score(y_valid, y_valid_pred))
print('recall score', recall_score(y_valid, y_valid_pred))
print('f1 score', f1_score(y_valid, y_valid_pred))
accuracy score 0.8212290502793296
precision score 0.78125
recall score 0.7352941176470589
f1 score 0.7575757575757576
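Precision exceeds recall here, which means the model produces more false negatives (missed survivors) than false positives. A confusion matrix (an addition, not in the original notebook) shows the breakdown directly:

from sklearn.metrics import confusion_matrix

# Rows are the true classes (0 = did not survive, 1 = survived),
# columns are the predicted classes
print(confusion_matrix(y_valid, y_valid_pred))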
In [31]:
X_test = preprocess(test_df)
y_test_pred = model.predict(X_test)
In [32]:
test_pred = X_test.copy()
test_pred['Survived'] = y_test_pred
submission = test_pred[['Survived']]
# PassengerId is the index, so to_csv writes it as the first column
submission.to_csv('HW1-Part2/titanic_submission.csv')