import pandas as pd
import numpy as np
import seaborn as sns
df=pd.read_excel('health_data_nulls.xlsx')
df.head()
| | Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 61.0 | 268300.0 | 41.0 | NaN | 3.0 | 62 |
| 1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 |
| 2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 |
| 3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 |
| 4 | 5 | 64.0 | 181100.0 | NaN | 0.0 | 70.0 | 81 |
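Before dropping rows, it helps to see how much is actually missing in each column; a quick check:
df.isna().sum() #count of missing values per column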
df=df.dropna()
df.head()
| | Person | Age | Income | Alcohol | Exercise | Smoke | Blood Pressure |
|---|---|---|---|---|---|---|---|
| 1 | 2 | 55.0 | 122200.0 | 51.0 | 7.0 | 56.0 | 53 |
| 2 | 3 | 53.0 | 82100.0 | 37.0 | 0.0 | 55.0 | 42 |
| 3 | 4 | 30.0 | 101400.0 | 41.0 | 20.0 | 61.0 | 48 |
| 8 | 9 | 59.0 | 233500.0 | 25.0 | 15.0 | 33.0 | 66 |
| 9 | 10 | 44.0 | 50400.0 | 64.0 | 0.0 | 85.0 | 54 |
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
y = df['Blood Pressure']
df=df[['Age','Income','Alcohol','Exercise','Smoke']]
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(510, 5) (510,) (128, 5) (128,)
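Because train_test_split shuffles randomly, the exact split (and the score below) will vary between runs; passing a seed makes it reproducible. A sketch, with an arbitrary seed value:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42) #seed value is arbitrary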
lm=linear_model.LinearRegression()
model=lm.fit(X_train,y_train)
predictions=lm.predict(X_test)
predictions[0:5]
array([72.82906953, 49.18391924, 35.14077957, 30.72714997, 12.87067795])
plt.scatter(y_test,predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.show()
print("Score:", model.score(X_test, y_test))
Score: 0.9318359424946115
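For regression, score returns the R² of the predictions on the test set. Other error metrics are one call away; a minimal sketch:
from sklearn import metrics
print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))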
from sklearn.model_selection import KFold
kf = KFold(n_splits=5) #5 folds gives the same 80/20 train/test proportion as the split above
kf.get_n_splits(df) # returns the number of splitting iterations in the cross-validator
print(kf)
KFold(n_splits=5, random_state=None, shuffle=False)
X=df.to_numpy()
Y=y.to_numpy()
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
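The loop above only slices out each fold; to actually evaluate per fold (this is what cross_val_score automates below), a minimal sketch:
fold_scores = []
for train_index, test_index in kf.split(X):
    fold_model = linear_model.LinearRegression().fit(X[train_index], Y[train_index])
    fold_scores.append(fold_model.score(X[test_index], Y[test_index]))
print(fold_scores) #one R^2 per fold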
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
scores = cross_val_score(model,X, Y, cv=5)
print(scores)
[0.94136405 0.92254527 0.93177242 0.91735091 0.9503647 ]
predictions = cross_val_predict(model, X, Y, cv=5)
plt.scatter(Y, predictions)
plt.show()
r2 = metrics.r2_score(Y, predictions)
r2
0.933838939927048
#LOOCV (Leave-One-Out Cross-Validation) is k-fold CV with k equal to the number of observations
#generally only practical for small data sets, since it fits one model per observation
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
loo.get_n_splits(X)
for train_index, test_index in loo.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    #print(X_train, X_test, y_train, y_test)
#LOOCV is more popular in the classification setting
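Since each LOOCV test set holds a single observation, R² is undefined per fold; scoring with an error metric instead is the usual workaround. A sketch:
loo_scores = cross_val_score(model, X, Y, cv=loo, scoring='neg_mean_squared_error')
print("LOOCV MSE:", -loo_scores.mean())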
mba=pd.ExcelFile('ElecMartSales.xlsx')
df2=mba.parse('Data')
df2.head()
| | Date | Day | Time | Region | Card Type | Gender | Buy Category | Items Ordered | Total Cost | High Item | Unnamed: 10 | Unnamed: 11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-03-06 | Sun | Morning | West | ElecMart | Female | High | 4 | 136.97 | 79.97 | NaN | NaN |
| 1 | 2016-03-06 | Sun | Morning | West | Other | Female | Medium | 1 | 25.55 | 25.55 | NaN | NaN |
| 2 | 2016-03-06 | Sun | Afternoon | West | ElecMart | Female | Medium | 5 | 113.95 | 90.47 | NaN | NaN |
| 3 | 2016-03-06 | Sun | Afternoon | NorthEast | Other | Female | Low | 1 | 6.82 | 6.82 | NaN | NaN |
| 4 | 2016-03-06 | Sun | Afternoon | West | ElecMart | Male | Medium | 4 | 147.32 | 83.21 | NaN | NaN |
two_way=pd.crosstab(df2['Region'], df2['Gender'])
two_way
| Region | Female | Male |
|---|---|---|
| MidWest | 43 | 28 |
| NorthEast | 62 | 53 |
| South | 63 | 30 |
| West | 66 | 55 |
from scipy.stats import chi2_contingency
stat, p, dof, expected = chi2_contingency(two_way)
stat
5.172525311449733
p
0.15959121450450045
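With p ≈ 0.16, we fail to reject independence between Region and Gender at the conventional 0.05 level. A quick decision check:
alpha = 0.05 #conventional significance level
if p <= alpha:
    print('Region and Gender appear dependent (reject H0)')
else:
    print('No evidence of dependence (fail to reject H0)')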
from scipy.stats import f_oneway
is_MidWest = df2['Region']=='MidWest'
data1 = df2[is_MidWest]['Total Cost'].to_numpy()
is_NE = df2['Region']=='NorthEast'
data2 = df2[is_NE]['Total Cost'].to_numpy()
is_South = df2['Region']=='South'
data3 = df2[is_South]['Total Cost'].to_numpy()
is_West = df2['Region']=='West'
data4 = df2[is_West]['Total Cost'].to_numpy()
stat, p = f_oneway(data1, data2, data3, data4)
print('stat=%.3f, p=%.3f' % (stat, p))
stat=1.964, p=0.119
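Here p ≈ 0.119, so the one-way ANOVA gives no evidence at the 0.05 level that mean Total Cost differs across regions. The group means put that in context:
df2.groupby('Region')['Total Cost'].mean() #per-region mean spend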
np.random.seed(0)
lam, size_1, size_2 = 5, 3, 1000
samples_1 = np.random.poisson(lam, size_1)
samples_2 = np.random.poisson(lam, size_2)
answer_1 = abs(np.mean(samples_1) - lam)
answer_2 = abs(np.mean(samples_2) - lam)
print("|Lambda - sample mean| with {} samples is {} and with {} samples is {}. ".format(size_1, answer_1, size_2, answer_2))
|Lambda - sample mean| with 3 samples is 1.666666666666667 and with 1000 samples is 0.05799999999999983.
plt.hist(samples_2)
plt.xlabel('samples_2 value')
plt.ylabel('count')
plt.title('np.random.poisson result\nlambda: 5')
plt.show()
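The shrinking gap is the law of large numbers at work; a quick sketch over growing (arbitrarily chosen) sample sizes makes the trend visible:
for n in [10, 100, 1000, 10000]:
    sample = np.random.poisson(lam, n)
    print(n, abs(sample.mean() - lam)) #gap tends to shrink as n grows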
np.random.seed(99)
mu, sigma, size_1, size_2 = 500, 100, 3, 1000
samples_1 = np.random.normal(mu, sigma, size_1)
samples_2 = np.random.normal(mu, sigma, size_2)
answer_1 = abs(np.mean(samples_1) - mu)
answer_2 = abs(np.mean(samples_2) - mu)
print("|mu - sample mean| with {} samples is {} and with {} samples is {}. ".format(size_1, answer_1, size_2, answer_2))
|mu - sample mean| with 3 samples is 73.2708278661861 and with 1000 samples is 5.668305231146178.
plt.hist(samples_2)
plt.xlabel('samples_2 value')
plt.ylabel('count')
plt.title('np.random.normal result\nmu:500, sigma:100')
plt.show()
#install the wordcloud package first: pip install wordcloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
df3 = pd.read_csv("winemag-data-130k-v2.csv", index_col=0)
#download from kaggle: https://www.kaggle.com/zynicide/wine-reviews/data
text=df3.description[0]
text
"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
text = " ".join(review for review in df3.description)
print ("There are {} words in the combination of all review.".format(len(text)))
There are 31661073 words in the combination of all review.
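Note that len(text) counts characters, not words; a rough word count (splitting on whitespace is an approximation) would be:
print("There are {} words across all reviews.".format(len(text.split())))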
stopwords = set(STOPWORDS)
stopwords.update(["drink", "now", "wine", "flavor", "flavors"])
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
If you want to learn more about making masks for wordclouds (to take on different shapes than the default), check out this site: https://www.datacamp.com/community/tutorials/wordcloud-python
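As a taste of what that looks like, a mask is just a NumPy array built from an image; the sketch below assumes a hypothetical local file wine_mask.png with a white background:
mask = np.array(Image.open("wine_mask.png")) #wine_mask.png is a hypothetical example file
wc = WordCloud(stopwords=stopwords, background_color="white", mask=mask).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()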