statistics for data science and ai
statistics for data science and ai.
- descriptive statistics : mean, median, measures of spread, variance, standard deviation
correlation is a measure of the strength and direction of the relationship between two quantitative variables.
- positive correlation
- no correlation
- negative correlation
pearson correlation
if the corr value is close to 1, there is a positive correlation; if it is close to 0, there is no correlation; if it is close to -1, there is a negative correlation.
import pandas as pd
from scipy import stats
from sklearn.feature_selection import r_regression
def compute_pearson_corr(filename):
    """Print the Pearson correlation between the Salary and Bonus columns.

    The CSV at ``filename`` is loaded with pandas and echoed to stdout. The
    correlation is then computed two ways -- with ``stats.pearsonr`` (which
    also yields a p-value) and with ``r_regression`` -- and the three
    resulting figures are printed.

    Args:
        filename: Path of a CSV file containing ``Salary`` and ``Bonus``
            columns.

    Returns:
        None. The results are only printed.
    """
    df = pd.read_csv(filename)
    print(f"df : {df}")

    pearson_result = stats.pearsonr(df["Salary"], df["Bonus"])
    correlation_coefficient, p_value = pearson_result

    # Double brackets keep Salary as a one-column DataFrame for r_regression.
    corr_eff = r_regression(df[["Salary"]], df["Bonus"])

    report_lines = (
        f"corr_eff : {corr_eff}",
        f"p_value : {p_value}",
        f"correlation_coefficient : {correlation_coefficient}",
    )
    for report_line in report_lines:
        print(report_line)
# Script entry for the correlation demo: run the Salary/Bonus Pearson report
# on the employment-records CSV (path is relative to the working directory).
filename = "./employment_records.csv"
compute_pearson_corr(filename)
- sampling
- estimating unknown parameters
- confidence interval
- hypothesis testing
classification program :
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import warnings
# Suppress every warning for the rest of the run to keep the output clean.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# numerical warnings raised by the libraries used below -- confirm intended.
warnings.filterwarnings("ignore")
def create_high_bonus_col(df, threshold=2000):
    """Add a binary ``high_bonus`` column flagging bonuses above a threshold.

    Args:
        df: DataFrame with a numeric ``Bonus`` column. It is modified in
            place.
        threshold: Value a bonus must strictly exceed to be flagged.
            Defaults to 2000, the cutoff this function originally hard-coded.

    Returns:
        The same DataFrame object, with ``high_bonus`` set to 1 where
        ``Bonus > threshold`` and 0 otherwise. A NaN bonus does not compare
        as greater and therefore gets 0.
    """
    # Vectorized comparison replaces the per-row Python lambda.
    df["high_bonus"] = (df["Bonus"] > threshold).astype(int)
    return df
def prep_cols(df):
    """Build the feature matrix and target vector for the bonus classifier.

    Args:
        df: DataFrame containing the ``Gender``, ``Nationality``,
            ``Marital Status``, ``Job Title``, ``Salary`` and ``high_bonus``
            columns.

    Returns:
        A tuple ``(x, y)``. ``x`` is a copy of the five feature columns in
        which the four categorical columns have been label-encoded; ``y`` is
        the ``high_bonus`` column of ``df``.
    """
    categorical_cols = ["Gender", "Nationality", "Marital Status", "Job Title"]

    # Work on a copy so the encoding does not overwrite the caller's frame.
    x = df[categorical_cols + ["Salary"]].copy()
    y = df["high_bonus"]

    encoder = LabelEncoder()
    for name in categorical_cols:
        # Cast to str first so each column is encoded from one uniform type;
        # fit_transform refits the encoder for every column.
        as_text = x[name].astype(str)
        x[name] = encoder.fit_transform(as_text)

    return x, y
def get_train_test_split(x, y, test_size=0.2, random_state=42):
    """Split features and target into training and test subsets.

    Args:
        x: Feature matrix.
        y: Target values aligned with ``x``.
        test_size: Passed through to ``train_test_split``. Defaults to 0.2,
            the value this function originally hard-coded.
        random_state: Seed passed through to ``train_test_split``. Defaults
            to 42, the value this function originally hard-coded.

    Returns:
        Whatever ``train_test_split`` returns for two inputs, which callers
        unpack as ``x_train, x_test, y_train, y_test``.
    """
    return train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
    )
def execute(f):
    """Run the whole high-bonus classification pipeline on one CSV file.

    Loads the CSV, derives the ``high_bonus`` label, prepares the features,
    splits them, trains the classifier, then writes the frame returned by
    ``classifier`` to ``output.csv`` and prints it.

    Args:
        f: Path (or other source accepted by ``pd.read_csv``) of the input
            CSV.

    Returns:
        None.
    """
    frame = create_high_bonus_col(pd.read_csv(f))
    features, target = prep_cols(frame)

    split_parts = get_train_test_split(features, target)
    x_train, x_test, y_train, y_test = split_parts

    frame = classifier(frame, features, x_train, x_test, y_train, y_test)

    frame.to_csv("output.csv")
    print(frame)
def classifier(df, x, x_train, x_test, y_train, y_test):
    """Train a Gaussian naive Bayes model and label every row of ``df``.

    Args:
        df: DataFrame that receives the predictions; modified in place.
        x: Encoded feature matrix for the full dataset.
        x_train: Training features used to fit the model.
        x_test: Accepted for interface symmetry; not used here.
        y_train: Training labels used to fit the model.
        y_test: Accepted for interface symmetry; not used here.

    Returns:
        ``df`` with a new ``predicted_value`` column holding the model's
        prediction for each row of ``x``.
    """
    fitted_model = GaussianNB().fit(x_train, y_train)
    # Predictions cover the encoded full dataset, not just the held-out split.
    df["predicted_value"] = fitted_model.predict(x)
    return df
# Script entry for the classification demo: run the pipeline on the employee
# CSV. execute() writes its result to output.csv and prints the final frame.
filename = "./employee.csv"
execute(filename)