Skip to content

statistics for data science and ai

statistics for data science and ai.

  • descriptive statistics : mean, median, measures of spread, variance, standard deviation

Correlation is a measure of the strength and direction of the relationship between two quantitative variables.

  • positive correlation
  • no correlation
  • negative correlation

pearson correlation

If the correlation value is close to 1, there is a positive correlation. If it is close to 0, there is no (linear) correlation. If it is close to -1, there is a negative correlation.

import pandas as pd
from scipy import stats
from sklearn.feature_selection import r_regression


def compute_pearson_corr(filename):
    """Print the Pearson correlation between the Salary and Bonus columns.

    The CSV at ``filename`` is loaded with pandas and echoed to stdout. The
    correlation is then computed twice: with ``scipy.stats.pearsonr`` (which
    also yields a p-value) and with scikit-learn's ``r_regression``. All
    three results are printed.

    Args:
        filename: Path of a CSV file containing "Salary" and "Bonus" columns.

    Returns:
        None. The results are only printed.
    """
    frame = pd.read_csv(filename)
    print(f"df : {frame}")

    # SciPy: coefficient plus the p-value of the test.
    correlation_coefficient, p_value = stats.pearsonr(
        frame["Salary"], frame["Bonus"]
    )

    # scikit-learn: expects a 2-D feature matrix, hence the double brackets.
    sklearn_coefficient = r_regression(frame[["Salary"]], frame["Bonus"])

    print(f"corr_eff : {sklearn_coefficient}")
    print(f"p_value : {p_value}")
    print(f"correlation_coefficient : {correlation_coefficient}")

    return None


# Script driver: run the correlation demo on the sample CSV located next to
# this script (path is relative to the current working directory).
filename = "./employment_records.csv"
compute_pearson_corr(filename)
  • sampling

  • estimating unknown parameters

  • confidence interval

hypothesis testing

classification program :

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every warning for the rest of the run.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries used below; consider narrowing it.
warnings.filterwarnings("ignore")


def create_high_bonus_col(df, threshold=2000):
    """Add a binary ``high_bonus`` column flagging bonuses above a threshold.

    Args:
        df: DataFrame with a numeric "Bonus" column. It is modified in place.
        threshold: Value a bonus must strictly exceed to be flagged as high.
            Defaults to 2000, the cutoff this function originally hard-coded.

    Returns:
        The same DataFrame object, with ``high_bonus`` set to 1 where
        ``Bonus > threshold`` and 0 otherwise.
    """
    # Vectorised comparison instead of a per-row lambda. Missing bonuses
    # compare as False and therefore map to 0, exactly as before.
    df["high_bonus"] = (df["Bonus"] > threshold).astype(int)
    return df


def prep_cols(df):
    """Build the feature matrix and target vector for the bonus classifier.

    The features are the four categorical columns (label-encoded to integer
    codes) followed by "Salary". The target is the "high_bonus" column.

    Args:
        df: DataFrame containing the feature columns and "high_bonus".

    Returns:
        A ``(features, target)`` tuple. ``features`` is a copy, so the
        encoding does not touch ``df``.
    """
    categorical_cols = ["Gender", "Nationality", "Marital Status", "Job Title"]

    features = df[categorical_cols + ["Salary"]].copy()
    target = df["high_bonus"]

    # One encoder is re-fitted per column; only the codes are kept.
    encoder = LabelEncoder()
    for name in categorical_cols:
        as_text = features[name].astype(str)
        features[name] = encoder.fit_transform(as_text)

    return features, target


def get_train_test_split(x, y, test_size=0.2, random_state=42):
    """Split features and target into train and test subsets.

    Args:
        x: Feature matrix.
        y: Target values aligned with ``x``.
        test_size: Portion of the data held out for testing, passed straight
            to ``train_test_split``. Defaults to 0.2, the value that was
            previously hard-coded.
        random_state: Seed that makes the shuffle reproducible. Defaults to
            42, the value that was previously hard-coded.

    Returns:
        The result of ``train_test_split``, which callers unpack as
        ``x_train, x_test, y_train, y_test``.
    """
    return train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )


def execute(f):
    """Run the full bonus-classification pipeline on a CSV file.

    Steps: load the CSV, derive the ``high_bonus`` label, encode the
    features, split into train and test sets, fit the classifier and attach
    its predictions. The result is written to "output.csv" in the current
    working directory and printed.

    Args:
        f: Path of the input CSV file.
    """
    records = create_high_bonus_col(pd.read_csv(f))
    features, labels = prep_cols(records)
    train_x, test_x, train_y, test_y = get_train_test_split(features, labels)

    scored = classifier(records, features, train_x, test_x, train_y, test_y)
    scored.to_csv("output.csv")
    print(scored)


def classifier(df, x, x_train, x_test, y_train, y_test):
    """Fit a Gaussian Naive Bayes model and attach its predictions to ``df``.

    The model is trained on ``x_train`` / ``y_train`` and then used to
    predict every row of ``x`` (the full encoded dataset, training rows
    included). ``x_test`` and ``y_test`` are accepted but not used here.

    Args:
        df: DataFrame that receives the "predicted_value" column (in place).
        x: Encoded feature matrix for all rows of ``df``.
        x_train: Training features.
        x_test: Test features (unused).
        y_train: Training labels.
        y_test: Test labels (unused).

    Returns:
        ``df`` with the added "predicted_value" column.
    """
    fitted = GaussianNB().fit(x_train, y_train)

    # Score the whole encoded dataset, not just the held-out split.
    df["predicted_value"] = fitted.predict(x)
    return df


# Script driver: run the classification pipeline on the sample CSV (path is
# relative to the current working directory); results go to "output.csv".
filename = "./employee.csv"
execute(filename)