Algorithm used in experiment
1import pandas as pd
2import numpy as np
3import seaborn as sns
4import matplotlib.pyplot as plt
5from sklearn.model_selection import cross_val_score
6from sklearn.linear_model import LogisticRegression
7from sklearn.naive_bayes import GaussianNB
8from sklearn.neighbors import KNeighborsClassifier
9
def data_summary(df):
    """Build a per-column summary of a DataFrame.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        A ``(combined_table, measures)`` tuple.

        ``combined_table`` is a DataFrame indexed by column name holding the
        transposed output of ``df.describe()`` plus two extra columns:
        ``'Missing'`` (null count per column) and ``'Outliers'`` (number of
        values outside the 1.5 * IQR fences). Columns for which a statistic
        is not computed hold NaN in the corresponding cell.

        ``measures`` maps each column name to a dict with the keys
        ``'Unique Values'`` and ``'Zero Values'``.
    """
    dtype_checks = pd.api.types

    # Summary statistics
    summary_stats = df.describe()

    # Count of missing values
    missing_values = df.isnull().sum().rename('Missing')

    # Count of outliers (values outside the 1.5 * IQR fences).
    outliers = {}
    for col in df.columns:
        series = df[col]
        # Quantiles are only defined for ordered, arithmetic data. Testing
        # for "not object" is not enough: categorical, string and boolean
        # columns are not object dtype either, yet have no usable quantile.
        is_numeric = (
            dtype_checks.is_numeric_dtype(series)
            and not dtype_checks.is_bool_dtype(series)
        )
        is_temporal = (
            dtype_checks.is_datetime64_any_dtype(series)
            or dtype_checks.is_timedelta64_dtype(series)
        )
        if not (is_numeric or is_temporal):
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = 1.5 * (q3 - q1)
        outside = (series < q1 - spread) | (series > q3 + spread)
        # Missing values compare as False on both sides, so they are never
        # counted as outliers.
        outliers[col] = int(outside.sum())

    # Other measures
    measures = {
        col: {
            'Unique Values': df[col].nunique(),
            'Zero Values': int((df[col] == 0).sum()),
        }
        for col in df.columns
    }

    # concat aligns the three pieces on column name; it already returns a
    # DataFrame, so no extra wrapping is required.
    combined_table = pd.concat(
        [summary_stats.T, missing_values, pd.Series(outliers, name='Outliers')],
        axis=1,
    )

    return combined_table, measures
233a38f066926f7499af29848c036ead5c02dd79b306d76b7a5d6ae684df255a
File path: https://loglock.extropy.dev/algorithm/233a38f066