#nlp
import re  # for regex
import string

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Tweet tokenizer does not split at apostrophes which is what we want
from nltk.tokenize import TweetTokenizer
import numpy as np
import scipy.stats as ss
import spacy


#FeatureEngineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
25
def cramers_corrected_stat(confusion_matrix):
    """Calculate the bias-corrected Cramer's V statistic for
    categorical-categorical association.

    Uses the correction from Bergsma, Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : 2-D array-like (numpy array or pandas DataFrame)
        Contingency table of observed counts.

    Returns
    -------
    float
        Corrected Cramer's V in [0, 1]; 0 means no association.
    """
    # Pearson chi-squared statistic; scipy applies Yates' continuity
    # correction by default for 2x2 tables.
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    # .sum().sum() collapses both axes, so this works for DataFrames
    # (column sums, then grand total) as well as numpy arrays.
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias correction: clamp at 0 so sampling noise cannot drive the
    # corrected phi^2 negative.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    # NOTE(review): undefined (division by zero) for a 1-row or 1-column
    # table, where min(kcorr-1, rcorr-1) == 0 — confirm callers never
    # pass degenerate tables.
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
39