Statistics for Data Science and AI

statistics for data science and ai.

Descriptive statistics: mean, median, and measures of spread (variance, standard deviation).

Correlation measures the strength and direction of the relationship between two quantitative variables.

positive correlation
no correlation
negative correlation

pearson correlation

If the correlation coefficient is close to 1, the correlation is positive.
If the correlation coefficient is close to 0, there is no correlation.
If the correlation coefficient is close to -1, the correlation is negative.

import pandas as pd
from scipy import stats
from sklearn.feature_selection import r_regression


def compute_pearson_corr(filename):
    """Print the Pearson correlation between the Salary and Bonus columns.

    The coefficient is computed twice: once with ``scipy.stats.pearsonr``
    (which also yields a p-value) and once with scikit-learn's
    ``r_regression``. The loaded frame and all results are printed; nothing
    is returned.

    Args:
        filename: CSV location handed to ``pandas.read_csv``; the file must
            provide "Salary" and "Bonus" columns.
    """
    records = pd.read_csv(filename)
    print(f"df : {records}")

    # SciPy route: coefficient plus its p-value.
    r_value, p_val = stats.pearsonr(records["Salary"], records["Bonus"])

    # scikit-learn route: expects a 2-D feature frame, hence the double
    # brackets around "Salary".
    sklearn_r = r_regression(records[["Salary"]], records["Bonus"])

    print(f"corr_eff : {sklearn_r}")

    print(f"p_value : {p_val}")
    print(f"correlation_coefficient : {r_value}")


# Script entry for the correlation example: runs as soon as this code is
# executed (there is no __main__ guard) and expects the CSV to sit next to
# the working directory.
filename = "./employment_records.csv"
compute_pearson_corr(filename)

sampling
estimating unknown parameters
confidence interval

hypothesis testing

Classification program:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every warning category for the rest of the run. NOTE(review): this
# also hides warnings that could be useful while debugging.
warnings.filterwarnings("ignore")


def create_high_bonus_col(df, threshold=2000):
    """Add a binary ``high_bonus`` column flagging bonuses above a threshold.

    The column is written onto ``df`` itself (the caller's frame is modified
    in place) and that same object is returned for convenient chaining.

    Args:
        df: DataFrame with a "Bonus" column whose values can be compared
            with a number.
        threshold: Value a bonus must strictly exceed to be flagged.
            Defaults to 2000.

    Returns:
        The input DataFrame, with ``high_bonus`` set to 1 where
        ``Bonus > threshold`` and 0 everywhere else.
    """
    # Vectorised comparison instead of a per-row Python lambda. A missing
    # bonus (NaN) compares False and is therefore flagged 0.
    df["high_bonus"] = (df["Bonus"] > threshold).astype(int)
    return df


def prep_cols(df):
    """Build the feature frame and target series for the bonus classifier.

    Features are a copy of the Gender, Nationality, Marital Status, Job Title
    and Salary columns; the four text columns are cast to ``str`` and
    label-encoded to integers. The target is the ``high_bonus`` column.

    Args:
        df: DataFrame containing the five feature columns and ``high_bonus``.

    Returns:
        A ``(features, target)`` pair. ``df`` itself is not modified because
        the features are encoded on a copy.
    """
    categorical_cols = ["Gender", "Nationality", "Marital Status", "Job Title"]
    features = df[categorical_cols + ["Salary"]].copy()
    target = df["high_bonus"]

    for name in categorical_cols:
        # Each column gets its own integer coding; fit_transform refits the
        # encoder from scratch on every call.
        as_text = features[name].astype(str)
        features[name] = LabelEncoder().fit_transform(as_text)

    return features, target


def get_train_test_split(x, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Thin wrapper around scikit-learn's ``train_test_split`` that exposes the
    previously hard-coded split settings as keyword arguments.

    Args:
        x: Feature data.
        y: Target data aligned with ``x``.
        test_size: Share of the data held out for testing, forwarded
            unchanged to ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        Whatever ``train_test_split`` returns for two inputs, i.e.
        ``x_train, x_test, y_train, y_test`` in that order.
    """
    return train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )


def execute(f):
    """Run the high-bonus classification pipeline on a CSV file.

    Steps: load the CSV, derive the ``high_bonus`` label, prepare the encoded
    features, split into train/test, fit the classifier and attach its
    predictions, then write the result to "output.csv" and print it.

    Args:
        f: CSV location handed to ``pandas.read_csv``.
    """
    records = create_high_bonus_col(pd.read_csv(f))
    features, target = prep_cols(records)
    train_x, test_x, train_y, test_y = get_train_test_split(features, target)

    scored = classifier(records, features, train_x, test_x, train_y, test_y)

    # Written to the current working directory; the index is included
    # because to_csv is called with its defaults.
    scored.to_csv("output.csv")
    print(scored)


def classifier(df, x, x_train, x_test, y_train, y_test):
    """Fit Gaussian naive Bayes on the training split and label every row.

    Args:
        df: DataFrame that receives the ``predicted_value`` column (modified
            in place).
        x: Encoded features for the full dataset; predictions are made for
            every row of it.
        x_train: Training features used to fit the model.
        x_test: Accepted for interface symmetry; not used here.
        y_train: Training labels used to fit the model.
        y_test: Accepted for interface symmetry; not used here.

    Returns:
        ``df`` with the added ``predicted_value`` column.
    """
    fitted_model = GaussianNB().fit(x_train, y_train)

    # Predictions cover the whole encoded dataset, training rows included.
    df["predicted_value"] = fitted_model.predict(x)
    return df


# Script entry for the classification example: runs the full pipeline on the
# employee CSV as soon as this code is executed (there is no __main__ guard).
filename = "./employee.csv"
execute(filename)