Machine Learning Classification (Bank Marketing Dataset)
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")
def load_data(filename="bank-full.csv"):
    """
    Load the semicolon-delimited bank marketing csv file into a DataFrame.

    Args:
        filename: Path of the csv file to read. Defaults to "bank-full.csv",
            which is resolved relative to the current working directory.

    Returns:
        pandas.DataFrame holding the parsed file contents.

    Raises:
        FileNotFoundError: if no file exists at ``filename``.
    """
    return pd.read_csv(filename, delimiter=";")
def write_to_file(df, filename="buffer.csv"):
    """
    Dump a DataFrame to disk as csv and announce the destination on stdout.

    The frame is written with pandas' default to_csv settings.
    Nothing is returned.
    """
    print("Writing Dataframe to file : ", filename)
    df.to_csv(path_or_buf=filename)
def preprocess(df):
    """
    Turn the raw bank marketing frame into a model-ready frame.

    Steps, in order:
    - Drop duplicate rows (the caller's frame is left untouched)
    - Add an integer 'target' column derived from 'y' ("no" -> 0, "yes" -> 1)
    - Ordinal-encode 'education' and 'month'
    - One-hot encode the remaining categorical features
    - Standardise the numeric columns listed in ``numeric_cols``

    Args:
        df: DataFrame containing the raw columns referenced below.

    Returns:
        A new DataFrame. It still contains the raw 'y' column next to
        'target'; the one-hot encoded source columns are replaced by their
        indicator columns.

    Raises:
        ValueError: if 'y' holds a value other than "no" or "yes".
    """
    # Take an explicit copy: the frame returned by drop_duplicates() can still
    # be tracked by pandas as derived from `df`, and the column assignments
    # below would then go through chained-assignment handling.
    df = df.drop_duplicates().copy()

    # Encode target variable; cast to int so the labels read 0/1, not 0.0/1.0
    target_encoder = OrdinalEncoder(categories=[["no", "yes"]])
    df["target"] = target_encoder.fit_transform(df[["y"]]).ravel().astype(int)

    # Encode categorical features
    ordinal_cols = ["education", "month"]
    CATEGORY_ENCODING_COLS = [
        "job",
        "marital",
        "default",
        "housing",
        "loan",
        "poutcome",
        "contact",
    ]

    # OrdinalEncoding
    # NOTE(review): with default categories the encoder sorts the observed
    # values, so 'month' is not encoded in calendar order - confirm intended.
    df[ordinal_cols] = OrdinalEncoder().fit_transform(df[ordinal_cols])

    # One Hot Encoding
    df = pd.get_dummies(df, columns=CATEGORY_ENCODING_COLS, dtype=np.uint8)

    # Scale the numeric columns
    numeric_cols = [
        "age",
        "balance",
        "day",
        "duration",
        "campaign",
        "pdays",
        "previous",
    ]
    # NOTE(review): the scaler is fitted on the whole frame, i.e. before the
    # train/test split done downstream - confirm this leakage is acceptable.
    ss = StandardScaler(copy=False)
    df[numeric_cols] = ss.fit_transform(df[numeric_cols])
    return df
def create_train_test_split_dataset(df):
    """
    Build a stratified 80/20 train-test split from the preprocessed frame.

    Every column except 'y' and 'target' is used as a feature and 'target'
    is the label. Stratifying on the label keeps the class ratio in both
    parts. The train_test_split result is returned unchanged.
    """
    features = df.drop(columns=["y", "target"])
    labels = df["target"]
    return train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )
# Logistic Regression
def train_logistic_regression_classifier(lr, train_test_data_split):
    """
    Fit `lr` on the training part of the split.

    `train_test_data_split` is unpacked as (X_train, X_test, y_train, y_test);
    only the two training entries are used. The same estimator object is
    returned after fitting.
    """
    features_train, _, labels_train, _ = train_test_data_split
    lr.fit(features_train, labels_train)
    return lr
def test_logistic_regression_classifier(lr, train_test_data_split):
    """
    Predict labels for the test part of the split with the fitted `lr`.

    `train_test_data_split` is unpacked as (X_train, X_test, y_train, y_test);
    only X_test is used. Returns whatever `lr.predict` returns.
    """
    _, features_test, _, _ = train_test_data_split
    return lr.predict(features_test)
def compute_metrics_logistic_regression(train_test_data_split, y_pred):
    """
    Print a classification report for the logistic regression predictions.

    `train_test_data_split` is unpacked as (X_train, X_test, y_train, y_test);
    y_test is compared against `y_pred`. Nothing is returned.
    """
    _, _, _, labels_test = train_test_data_split
    print(" Logistic Regression Report")
    report = classification_report(labels_test, y_pred)
    print(report)
# Random Forest Classifier
def train_random_forest_classifier(rfc, train_test_data_split):
    """
    Fit `rfc` on the training part of the split.

    `train_test_data_split` is unpacked as (X_train, X_test, y_train, y_test);
    only the two training entries are used. The same estimator object is
    returned after fitting.
    """
    features_train, _, labels_train, _ = train_test_data_split
    rfc.fit(features_train, labels_train)
    return rfc
def test_random_forest_classifier(rfc, train_test_data_split):
    """
    Predict labels for the test part of the split with the fitted `rfc`.

    `train_test_data_split` is unpacked as (X_train, X_test, y_train, y_test);
    only X_test is used. Returns whatever `rfc.predict` returns.
    """
    _, features_test, _, _ = train_test_data_split
    return rfc.predict(features_test)
def compute_metrics_random_forest_classifier(train_test_data_split, y_pred):
    """
    Print a classification report for the random forest predictions.

    `train_test_data_split` is unpacked as (X_train, X_test, y_train, y_test);
    y_test is compared against `y_pred`. Nothing is returned.
    """
    _, _, _, labels_test = train_test_data_split
    print("Random Forest Classifier Report")
    report = classification_report(labels_test, y_pred)
    print(report)
def execute():
    """
    Run the whole experiment end to end.

    Loads and preprocesses the data, builds one train-test split, then fits,
    predicts with and reports on a logistic regression followed by a random
    forest, both evaluated on that same split.
    """
    split = create_train_test_split_dataset(preprocess(load_data()))

    # Logistic Regression
    logistic_model = train_logistic_regression_classifier(
        LogisticRegression(class_weight="balanced", random_state=42),
        split,
    )
    logistic_predictions = test_logistic_regression_classifier(logistic_model, split)
    compute_metrics_logistic_regression(split, logistic_predictions)

    # Random Forest
    forest_settings = {
        "n_estimators": 200,
        "class_weight": "balanced",
        "random_state": 42,
    }
    forest_model = train_random_forest_classifier(
        RandomForestClassifier(**forest_settings),
        split,
    )
    forest_predictions = test_random_forest_classifier(forest_model, split)
    compute_metrics_random_forest_classifier(split, forest_predictions)
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger data loading and model training.
if __name__ == "__main__":
    execute()