Machine Learning Classification (Bank Marketing Dataset)

Machine Learning Classification (Bank Marketing Dataset)

Code:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings

# Silence every warning category for the whole process, presumably to keep
# the printed reports free of library noise.
# NOTE(review): a blanket "ignore" also hides warnings that may matter
# (deprecations, convergence problems) - consider narrowing the filter.
warnings.filterwarnings("ignore")


def load_data(filename="bank-full.csv"):
    """
    Load csv file
    Place the csv file in the current directory ( filename -> bank-full.csv )
    or pass the path of another semicolon-delimited csv file

    filename : path of the csv file to read ( default "bank-full.csv" )
    Returns the parsed rows as a pandas DataFrame
    """
    # The file uses ';' as field separator instead of ','
    return pd.read_csv(filename, delimiter=";")


def write_to_file(df, filename="buffer.csv"):
    """
    Dump a dataframe to a csv file
    The destination is announced on stdout before the file is written
    """
    print("Writing Dataframe to file : ", filename)
    df.to_csv(path_or_buf=filename)


def preprocess(df):
    """
    - Drop duplicates
    - Encode categorical features
    - Convert target column 'y' -> 1/0 ( stored in a new 'target' column )
    - Scale all numeric columns

    The input dataframe is not modified; a new dataframe is returned.
    The original 'y' column is kept next to 'target'.

    Raises ValueError ( from OrdinalEncoder ) when 'y', 'education' or
    'month' contains a value that is not listed in the explicit category
    orders below.
    """

    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's dataframe.
    df = df.drop_duplicates().copy()

    # Encode target variable ( "no" -> 0.0, "yes" -> 1.0 )
    target_encoder = OrdinalEncoder(categories=[["no", "yes"]])
    df["target"] = target_encoder.fit_transform(df[["y"]]).ravel()

    # Encode categorical features
    ordinal_cols = ["education", "month"]
    CATEGORY_ENCODING_COLS = [
        "job",
        "marital",
        "default",
        "housing",
        "loan",
        "poutcome",
        "contact",
    ]

    # OrdinalEncoding
    # Without explicit categories the encoder ranks values alphabetically,
    # which gives months a meaningless order ( apr < aug < dec < ... ).
    # NOTE(review): "unknown" education has no natural rank; it is kept last,
    # which matches the position it had under the alphabetical default.
    education_order = ["primary", "secondary", "tertiary", "unknown"]
    month_order = [
        "jan",
        "feb",
        "mar",
        "apr",
        "may",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    ]
    ordinal_encoder = OrdinalEncoder(categories=[education_order, month_order])
    df[ordinal_cols] = ordinal_encoder.fit_transform(df[ordinal_cols])

    # One Hot Encoding
    df = pd.get_dummies(df, columns=CATEGORY_ENCODING_COLS, dtype=np.uint8)

    # Scale all numeric columns
    numeric_cols = [
        "age",
        "balance",
        "day",
        "duration",
        "campaign",
        "pdays",
        "previous",
    ]

    # NOTE(review): the scaler is fit on every row, i.e. before
    # create_train_test_split_dataset separates the test set, so test-set
    # statistics influence the scaling.
    ss = StandardScaler()
    df[numeric_cols] = ss.fit_transform(df[numeric_cols])

    return df


def create_train_test_split_dataset(df):
    """
    Split the dataframe into a train part and a test part ( 80 / 20 )
    Features are every column except 'y' and 'target'; labels are 'target'
    The split is stratified on the labels so both parts keep the class ratio
    Returns the result of train_test_split for ( features, labels )
    """
    features = df.drop(columns=["y", "target"])
    labels = df["target"]
    return train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )


# Logistic Regression
def train_logistic_regression_classifier(lr, train_test_data_split):
    """
    Fit the given estimator on the training part of the split
    and hand the same ( now fitted ) estimator back
    """
    train_features, _, train_labels, _ = train_test_data_split
    lr.fit(train_features, train_labels)
    return lr


def test_logistic_regression_classifier(lr, train_test_data_split):
    X_train, X_test, y_train, y_test = train_test_data_split
    lr_y_pred = lr.predict(X_test)
    return lr_y_pred


def compute_metrics_logistic_regression(train_test_data_split, y_pred):
    """
    Print a classification report comparing y_pred with the
    true labels of the test part of the split
    """
    _, _, _, true_labels = train_test_data_split
    print(" Logistic Regression Report")
    report = classification_report(true_labels, y_pred)
    print(report)


# Random Forest Classifier
def train_random_forest_classifier(rfc, train_test_data_split):
    """
    Fit the given estimator on the training part of the split
    and hand the same ( now fitted ) estimator back
    """
    train_features, _, train_labels, _ = train_test_data_split
    rfc.fit(train_features, train_labels)
    return rfc


def test_random_forest_classifier(rfc, train_test_data_split):
    X_train, X_test, y_train, y_test = train_test_data_split
    rfc_y_pred = rfc.predict(X_test)
    return rfc_y_pred


def compute_metrics_random_forest_classifier(train_test_data_split, y_pred):
    """
    Print a classification report comparing y_pred with the
    true labels of the test part of the split
    """
    _, _, _, true_labels = train_test_data_split
    print("Random Forest Classifier Report")
    report = classification_report(true_labels, y_pred)
    print(report)


def execute():
    """
    Run the whole experiment
    load -> preprocess -> train/test split, then for a logistic regression
    and a random forest: fit on the train part, predict the test part and
    print the classification report
    """
    split = create_train_test_split_dataset(preprocess(load_data()))

    # Logistic Regression
    log_reg = train_logistic_regression_classifier(
        LogisticRegression(class_weight="balanced", random_state=42),
        split,
    )
    log_reg_predictions = test_logistic_regression_classifier(log_reg, split)
    compute_metrics_logistic_regression(split, log_reg_predictions)

    # Random Forest
    forest_params = {
        "n_estimators": 200,
        "class_weight": "balanced",
        "random_state": 42,
    }
    forest = train_random_forest_classifier(
        RandomForestClassifier(**forest_params),
        split,
    )
    forest_predictions = test_random_forest_classifier(forest, split)
    compute_metrics_random_forest_classifier(split, forest_predictions)


# Run the pipeline only when the file is executed as a script, so importing
# the module ( e.g. to reuse preprocess ) does not load data and train models.
if __name__ == "__main__":
    execute()