# Load the survey spreadsheet into a DataFrame and preview the first rows.
import pandas as pd
import numpy as np
df=pd.read_excel('survey_data2.xlsx')
df.head()
Person | Age | Gender | State | Children | Salary | Opinion | Agree or Not | Unnamed: 8 | Unnamed: 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Middle-aged | 2 | Texas | 2 | 63017 | Strongly agree | 1 | NaN | NaN |
1 | 2 | Middle-aged | 2 | Virginia | 3 | 100302 | Strongly disagree | 0 | NaN | NaN |
2 | 3 | Middle-aged | 2 | California | 0 | 144043 | Strongly agree | 1 | NaN | NaN |
3 | 4 | Young | 2 | California | 0 | 36025 | Agree | 1 | NaN | NaN |
4 | 5 | Middle-aged | 1 | Texas | 0 | 97543 | Neutral | 0 | NaN | NaN |
# Keep only the modelling columns; this drops the empty 'Unnamed: 8'/'Unnamed: 9'
# columns, the 'Person' id, and the text 'Opinion' column (its numeric
# encoding 'Agree or Not' is kept as the target).
df=df[['Age','Gender', 'State', 'Children','Salary', 'Agree or Not']]
df.head()
Age | Gender | State | Children | Salary | Agree or Not | |
---|---|---|---|---|---|---|
0 | Middle-aged | 2 | Texas | 2 | 63017 | 1 |
1 | Middle-aged | 2 | Virginia | 3 | 100302 | 0 |
2 | Middle-aged | 2 | California | 0 | 144043 | 1 |
3 | Young | 2 | California | 0 | 36025 | 1 |
4 | Middle-aged | 1 | Texas | 0 | 97543 | 0 |
# One-hot encode the categorical columns, dropping the first level of each
# (drop_first=True) to avoid perfectly collinear dummy columns.
# NOTE(review): prefix='_' plus get_dummies' default '_' separator produces
# column names like '__Middle-aged' / '__Young' (see the head() output below).
df = pd.concat([df,pd.get_dummies(df['Age'], prefix='_', drop_first=True)],axis=1)
df.drop(['Age'],axis=1, inplace=True)
df = pd.concat([df,pd.get_dummies(df['State'], prefix='State', drop_first=True)],axis=1)
df.drop(['State'],axis=1, inplace=True)
# Recode Gender from {1, 2} to {0, 1}.
df['Gender']=df['Gender']-1
df.head()
Gender | Children | Salary | Agree or Not | __Middle-aged | __Young | State_California | State_Florida | State_Illinois | State_Michigan | State_Minnesota | State_New York | State_Ohio | State_Texas | State_Virginia | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 63017 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 3 | 100302 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 1 | 0 | 144043 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 36025 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 97543 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Derived features, built on a copy of the encoded frame:
df1=df.copy()
# interaction of the gender indicator and the number of children,
df1['Gend_child']=df1['Gender']*df1['Children']
# natural log of salary (compresses the right tail),
df1['log_Salary']=np.log(df1['Salary'])
# and a quadratic salary term.
df1['Salary_Sq'] = df1['Salary']**2
df1.head()
Gender | Children | Salary | Agree or Not | __Middle-aged | __Young | State_California | State_Florida | State_Illinois | State_Michigan | State_Minnesota | State_New York | State_Ohio | State_Texas | State_Virginia | Gend_child | log_Salary | Salary_Sq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 63017 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 11.051160 | 3971142289 |
1 | 1 | 3 | 100302 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 11.515941 | 10060491204 |
2 | 1 | 0 | 144043 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11.877867 | 20748385849 |
3 | 1 | 0 | 36025 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10.491968 | 1297800625 |
4 | 0 | 0 | 97543 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 11.488049 | 9514636849 |
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# 80/20 train/test split: sample 80% of the rows (random_state=0 for
# reproducibility), then take the remaining rows as the test set.
df_copy = df.copy()
train_set = df_copy.sample(frac=0.80, random_state=0)
test_set = df_copy.drop(train_set.index)
train_set.head()
Gender | Children | Salary | Agree or Not | __Middle-aged | __Young | State_California | State_Florida | State_Illinois | State_Michigan | State_Minnesota | State_New York | State_Ohio | State_Texas | State_Virginia | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
132 | 0 | 0 | 97814 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
309 | 0 | 2 | 69817 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
334 | 0 | 1 | 48225 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
196 | 0 | 0 | 37929 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
246 | 1 | 2 | 100062 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
# Preview the held-out 20% of rows.
test_set.head()
Gender | Children | Salary | Agree or Not | __Middle-aged | __Young | State_California | State_Florida | State_Illinois | State_Michigan | State_Minnesota | State_New York | State_Ohio | State_Texas | State_Virginia | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 0 | 3 | 87457 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
25 | 1 | 2 | 148075 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
28 | 1 | 2 | 37963 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
31 | 1 | 2 | 49505 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
32 | 0 | 3 | 49723 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# pop() removes the target column from each set in place and returns it,
# so train_set/test_set now hold only the features.
test_set_labels = test_set.pop('Agree or Not')
train_set_labels = train_set.pop('Agree or Not')
# Fit a logistic regression; liblinear is a good solver for a small dataset.
model = LogisticRegression(solver = 'liblinear', random_state=0).fit(train_set,train_set_labels)
# Per-row [P(class 0), P(class 1)] on the training set.
model.predict_proba(train_set)
array([[0.57614226, 0.42385774], [0.55455629, 0.44544371], [0.53776249, 0.46223751], [0.52972181, 0.47027819], [0.57786408, 0.42213592], [0.58483034, 0.41516966], [0.54040349, 0.45959651], [0.56540533, 0.43459467], [0.54585125, 0.45414875], [0.56552639, 0.43447361], [0.57141167, 0.42858833], [0.58043178, 0.41956822], [0.60341186, 0.39658814], [0.57132021, 0.42867979], [0.5893605 , 0.4106395 ], [0.56271841, 0.43728159], [0.54686083, 0.45313917], [0.58593019, 0.41406981], [0.53682003, 0.46317997], [0.58387605, 0.41612395], [0.55934182, 0.44065818], [0.56596354, 0.43403646], [0.60158629, 0.39841371], [0.53495542, 0.46504458], [0.56675815, 0.43324185], [0.57687778, 0.42312222], [0.58254418, 0.41745582], [0.53062622, 0.46937378], [0.57134019, 0.42865981], [0.58651175, 0.41348825], [0.59304194, 0.40695806], [0.53621291, 0.46378709], [0.58500863, 0.41499137], [0.56894906, 0.43105094], [0.53557681, 0.46442319], [0.58003509, 0.41996491], [0.61174906, 0.38825094], [0.53621604, 0.46378396], [0.52205014, 0.47794986], [0.56006569, 0.43993431], [0.53389272, 0.46610728], [0.55964656, 0.44035344], [0.56382465, 0.43617535], [0.57804779, 0.42195221], [0.55750088, 0.44249912], [0.59881966, 0.40118034], [0.52868818, 0.47131182], [0.54712211, 0.45287789], [0.54476892, 0.45523108], [0.57767345, 0.42232655], [0.60171942, 0.39828058], [0.54722553, 0.45277447], [0.53017912, 0.46982088], [0.62305308, 0.37694692], [0.57347701, 0.42652299], [0.54738492, 0.45261508], [0.57612003, 0.42387997], [0.53834046, 0.46165954], [0.57617521, 0.42382479], [0.55885524, 0.44114476], [0.54325247, 0.45674753], [0.56596277, 0.43403723], [0.57593456, 0.42406544], [0.53486408, 0.46513592], [0.56685755, 0.43314245], [0.55072519, 0.44927481], [0.58975613, 0.41024387], [0.554095 , 0.445905 ], [0.58681995, 0.41318005], [0.51626174, 0.48373826], [0.57926587, 0.42073413], [0.55168859, 0.44831141], [0.59888525, 0.40111475], [0.55290141, 0.44709859], [0.60811903, 0.39188097], [0.53694097, 0.46305903], [0.55069801, 
0.44930199], [0.55876395, 0.44123605], [0.56508529, 0.43491471], [0.52922144, 0.47077856], [0.56297708, 0.43702292], [0.57488956, 0.42511044], [0.54378892, 0.45621108], [0.55693567, 0.44306433], [0.56747848, 0.43252152], [0.54154045, 0.45845955], [0.55130514, 0.44869486], [0.57066218, 0.42933782], [0.54748988, 0.45251012], [0.57547923, 0.42452077], [0.5740764 , 0.4259236 ], [0.56565207, 0.43434793], [0.5575094 , 0.4424906 ], [0.56540147, 0.43459853], [0.58493092, 0.41506908], [0.56023501, 0.43976499], [0.57156076, 0.42843924], [0.53703538, 0.46296462], [0.55730036, 0.44269964], [0.53313748, 0.46686252], [0.54884153, 0.45115847], [0.52919486, 0.47080514], [0.57626563, 0.42373437], [0.53432768, 0.46567232], [0.55857902, 0.44142098], [0.529979 , 0.470021 ], [0.55834378, 0.44165622], [0.58424733, 0.41575267], [0.58434947, 0.41565053], [0.56500276, 0.43499724], [0.61437241, 0.38562759], [0.52985314, 0.47014686], [0.55631376, 0.44368624], [0.53562286, 0.46437714], [0.58531487, 0.41468513], [0.55123061, 0.44876939], [0.58565531, 0.41434469], [0.59512758, 0.40487242], [0.52897437, 0.47102563], [0.58303481, 0.41696519], [0.57151004, 0.42848996], [0.51812204, 0.48187796], [0.56767334, 0.43232666], [0.53191955, 0.46808045], [0.56154741, 0.43845259], [0.58967716, 0.41032284], [0.55032527, 0.44967473], [0.54265828, 0.45734172], [0.53834748, 0.46165252], [0.52865456, 0.47134544], [0.55657556, 0.44342444], [0.57812509, 0.42187491], [0.56669882, 0.43330118], [0.57114189, 0.42885811], [0.58617228, 0.41382772], [0.56909375, 0.43090625], [0.51622412, 0.48377588], [0.55949342, 0.44050658], [0.57116649, 0.42883351], [0.55159623, 0.44840377], [0.56405771, 0.43594229], [0.53598736, 0.46401264], [0.56372818, 0.43627182], [0.5689329 , 0.4310671 ], [0.55088513, 0.44911487], [0.56637823, 0.43362177], [0.57965664, 0.42034336], [0.56069963, 0.43930037], [0.58574897, 0.41425103], [0.55643304, 0.44356696], [0.59786259, 0.40213741], [0.55926061, 0.44073939], [0.56555184, 0.43444816], [0.57654607, 
0.42345393], [0.53371935, 0.46628065], [0.56564822, 0.43435178], [0.53564862, 0.46435138], [0.58691505, 0.41308495], [0.60021733, 0.39978267], [0.55765958, 0.44234042], [0.56534364, 0.43465636], [0.56073905, 0.43926095], [0.56396511, 0.43603489], [0.57015927, 0.42984073], [0.54005819, 0.45994181], [0.55208905, 0.44791095], [0.54357015, 0.45642985], [0.56221101, 0.43778899], [0.56670498, 0.43329502], [0.56277941, 0.43722059], [0.59667672, 0.40332328], [0.55375459, 0.44624541], [0.55446559, 0.44553441], [0.55686907, 0.44313093], [0.58065263, 0.41934737], [0.55884828, 0.44115172], [0.61133978, 0.38866022], [0.59358789, 0.40641211], [0.5531605 , 0.4468395 ], [0.57149621, 0.42850379], [0.57898439, 0.42101561], [0.5556909 , 0.4443091 ], [0.53603809, 0.46396191], [0.53618092, 0.46381908], [0.573444 , 0.426556 ], [0.55645163, 0.44354837], [0.55971229, 0.44028771], [0.56652543, 0.43347457], [0.5394852 , 0.4605148 ], [0.53954913, 0.46045087], [0.54976451, 0.45023549], [0.53808854, 0.46191146], [0.53521773, 0.46478227], [0.54921061, 0.45078939], [0.61112203, 0.38887797], [0.60021808, 0.39978192], [0.56328435, 0.43671565], [0.57931788, 0.42068212], [0.53222036, 0.46777964], [0.58049368, 0.41950632], [0.55038662, 0.44961338], [0.5728513 , 0.4271487 ], [0.58553041, 0.41446959], [0.57028924, 0.42971076], [0.58489663, 0.41510337], [0.55885292, 0.44114708], [0.5778235 , 0.4221765 ], [0.53107874, 0.46892126], [0.56566518, 0.43433482], [0.55453149, 0.44546851], [0.55242815, 0.44757185], [0.57273304, 0.42726696], [0.56405076, 0.43594924], [0.58325833, 0.41674167], [0.54079083, 0.45920917], [0.54083292, 0.45916708], [0.57548843, 0.42451157], [0.57305555, 0.42694445], [0.55553361, 0.44446639], [0.56252226, 0.43747774], [0.53878655, 0.46121345], [0.55922889, 0.44077111], [0.56392266, 0.43607734], [0.55293244, 0.44706756], [0.58555935, 0.41444065], [0.53775313, 0.46224687], [0.52454739, 0.47545261], [0.53418712, 0.46581288], [0.56699161, 0.43300839], [0.55295184, 0.44704816], [0.56117959, 
0.43882041], [0.56182091, 0.43817909], [0.56126382, 0.43873618], [0.53440107, 0.46559893], [0.61407943, 0.38592057], [0.55055358, 0.44944642], [0.60224049, 0.39775951], [0.57169985, 0.42830015], [0.57175287, 0.42824713], [0.59396107, 0.40603893], [0.56441033, 0.43558967], [0.54664542, 0.45335458], [0.58470766, 0.41529234], [0.56903603, 0.43096397], [0.55719584, 0.44280416], [0.56240332, 0.43759668], [0.58250908, 0.41749092], [0.53288595, 0.46711405], [0.55796611, 0.44203389], [0.57236285, 0.42763715], [0.59832576, 0.40167424], [0.58335901, 0.41664099], [0.5208567 , 0.4791433 ], [0.56050869, 0.43949131], [0.59263365, 0.40736635], [0.56333375, 0.43666625], [0.55569013, 0.44430987], [0.57031077, 0.42968923], [0.56133724, 0.43866276], [0.53415589, 0.46584411], [0.53998959, 0.46001041], [0.58053648, 0.41946352], [0.54174613, 0.45825387], [0.56489864, 0.43510136], [0.54124047, 0.45875953], [0.52823306, 0.47176694], [0.51855139, 0.48144861], [0.5852067 , 0.4147933 ], [0.56616781, 0.43383219], [0.55158924, 0.44841076], [0.55997058, 0.44002942], [0.56743843, 0.43256157], [0.55027013, 0.44972987], [0.54627831, 0.45372169], [0.53971518, 0.46028482], [0.55599925, 0.44400075], [0.56199781, 0.43800219], [0.56926227, 0.43073773], [0.57883062, 0.42116938], [0.55575986, 0.44424014], [0.57091743, 0.42908257], [0.57950983, 0.42049017], [0.60408229, 0.39591771], [0.57150159, 0.42849841], [0.58090019, 0.41909981], [0.54765236, 0.45234764], [0.53287736, 0.46712264], [0.56185181, 0.43814819], [0.55186555, 0.44813445], [0.52986643, 0.47013357], [0.5320305 , 0.4679695 ], [0.5806213 , 0.4193787 ], [0.53944777, 0.46055223], [0.52353454, 0.47646546], [0.53446979, 0.46553021], [0.57496165, 0.42503835], [0.56127696, 0.43872304], [0.57958783, 0.42041217], [0.548697 , 0.451303 ], [0.54927898, 0.45072102], [0.5451549 , 0.4548451 ], [0.52996102, 0.47003898], [0.56775421, 0.43224579], [0.56787974, 0.43212026], [0.57696587, 0.42303413], [0.60379705, 0.39620295], [0.5654755 , 0.4345245 ], [0.57356605, 
0.42643395], [0.53269535, 0.46730465], [0.53890274, 0.46109726], [0.5642321 , 0.4357679 ], [0.54833562, 0.45166438], [0.55837783, 0.44162217], [0.52792336, 0.47207664], [0.58321866, 0.41678134], [0.57472236, 0.42527764], [0.61566602, 0.38433398], [0.54172354, 0.45827646], [0.53458613, 0.46541387]])
# Hard 0/1 predictions on the training set (default 0.5 threshold).
model.predict(train_set)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)
# Class probabilities for the held-out test rows.
model.predict_proba(test_set)
array([[0.5681862 , 0.4318138 ], [0.6141248 , 0.3858752 ], [0.52974839, 0.47025161], [0.53876082, 0.46123918], [0.53893082, 0.46106918], [0.57041151, 0.42958849], [0.53753002, 0.46246998], [0.57213394, 0.42786606], [0.57271308, 0.42728692], [0.56414799, 0.43585201], [0.59842003, 0.40157997], [0.52680482, 0.47319518], [0.56234077, 0.43765923], [0.56277555, 0.43722445], [0.54249083, 0.45750917], [0.53113188, 0.46886812], [0.54684995, 0.45315005], [0.53778277, 0.46221723], [0.5677673 , 0.4322327 ], [0.57894537, 0.42105463], [0.61147622, 0.38852378], [0.5581356 , 0.4418644 ], [0.56175756, 0.43824244], [0.59836271, 0.40163729], [0.56545622, 0.43454378], [0.55440512, 0.44559488], [0.54287479, 0.45712521], [0.60223598, 0.39776402], [0.53849567, 0.46150433], [0.59607769, 0.40392231], [0.58637628, 0.41362372], [0.57209399, 0.42790601], [0.58146544, 0.41853456], [0.56576387, 0.43423613], [0.56182709, 0.43817291], [0.55546465, 0.44453535], [0.53668816, 0.46331184], [0.5327266 , 0.4672734 ], [0.56258481, 0.43741519], [0.5449339 , 0.4550661 ], [0.55125623, 0.44874377], [0.54852214, 0.45147786], [0.56771108, 0.43228892], [0.57456741, 0.42543259], [0.53460252, 0.46539748], [0.56018321, 0.43981679], [0.53688167, 0.46311833], [0.54161992, 0.45838008], [0.58375177, 0.41624823], [0.5543462 , 0.4456538 ], [0.52879922, 0.47120078], [0.55965971, 0.44034029], [0.5609601 , 0.4390399 ], [0.535263 , 0.464737 ], [0.59502852, 0.40497148], [0.55951353, 0.44048647], [0.56587257, 0.43412743], [0.55827878, 0.44172122], [0.54580146, 0.45419854], [0.56661097, 0.43338903], [0.53454865, 0.46545135], [0.53428785, 0.46571215], [0.56091373, 0.43908627], [0.57105272, 0.42894728], [0.555398 , 0.444602 ], [0.59007498, 0.40992502], [0.57462341, 0.42537659], [0.54071212, 0.45928788], [0.57373798, 0.42626202], [0.56697621, 0.43302379], [0.56142301, 0.43857699], [0.602596 , 0.397404 ], [0.56667031, 0.43332969], [0.55353978, 0.44646022], [0.56563511, 0.43436489], [0.56849185, 0.43150815], [0.54393061, 
0.45606939], [0.5680299 , 0.4319701 ], [0.56263346, 0.43736654], [0.56620095, 0.43379905]])
# Hard 0/1 predictions for the test rows.
model.predict(test_set)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)
# Mean accuracy on the training data.
model.score(train_set,train_set_labels)
0.5768025078369906
# Mean accuracy on the held-out test data.
model.score(test_set,test_set_labels)
0.5875
# Both confusion matrices have an all-zero second column: the model predicts
# class 0 for every row, so the accuracies above are just the class-0 base rates.
confusion_matrix(train_set_labels,model.predict(train_set))
array([[184, 0], [135, 0]], dtype=int64)
confusion_matrix(test_set_labels,model.predict(test_set))
array([[47, 0], [33, 0]], dtype=int64)
import matplotlib.pyplot as plt
# Render the test-set confusion matrix as an annotated heatmap.
cm = confusion_matrix(test_set_labels, model.predict(test_set))
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Flip the y-axis so row 0 ("Actual 0s") is drawn at the top.
ax.set_ylim(1.5, -0.5)
# Write each cell count on top of its heatmap cell.  (The pasted original had
# lost the loop-body indentation, which is a syntax error; restored here.)
for i in range(2):
    for j in range(2):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()
# Refit with weaker regularization (C=10.0; sklearn's default is C=1.0).
model = LogisticRegression(C=10.0, solver = 'liblinear', random_state=0).fit(train_set,train_set_labels)
model.score(train_set,train_set_labels)
0.5768025078369906
# Custom decision threshold: prob1[:,0] is P(class 0); predict class 1
# whenever P(class 0) <= 0.54, i.e. lower the bar for predicting class 1.
prob1 = model.predict_proba(train_set)
predicted=[0 if i>0.54 else 1 for i in prob1[:,0]]
predicted
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1]
# Count misclassifications under the custom threshold: labels and predictions
# are both 0/1, so |label - prediction| is 1 exactly on the errors.
sum(abs(train_set_labels-predicted))
149
len(train_set_labels)
319
# Use the actual training-set size rather than the hard-coded 319 so this
# stays correct if the split fraction or the dataset changes.
error_percent = sum(abs(train_set_labels-predicted))/len(train_set_labels)
# Accuracy = 1 - error rate.
1-error_percent
0.5329153605015674
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Standardize the training features (zero mean, unit variance) before
# clustering so no single column dominates the Euclidean distances.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_set)
# 5 clusters, random centroid initialization, best of 10 restarts.
kmeans = KMeans(init="random", n_clusters=5, n_init=10, max_iter=300, random_state=42)
kmeans.fit(scaled_features)
KMeans(init='random', n_clusters=5, random_state=42)
# Sum of squared distances of samples to their closest centroid (lower = tighter clusters).
kmeans.inertia_
2974.411076839821
# Centroid coordinates, expressed in the standardized feature space.
kmeans.cluster_centers_
array([[-7.11156596e-02, -1.43669774e-01, -1.73749959e-01, -9.12870929e-02, -1.18004662e-02, 3.50594733e+00, -3.33913548e-01, -3.22189740e-01, -3.78641223e-01, -2.91605922e-01, -3.39683110e-01, -3.56662979e-01, -3.89389970e-01, -3.16227766e-01], [-2.90384537e-01, 1.56194832e-01, 1.07580217e-01, 2.43432248e-01, -1.94930944e-01, -2.85229613e-01, -3.33913548e-01, 3.10376116e+00, -3.78641223e-01, -2.91605922e-01, -3.39683110e-01, -3.56662979e-01, -3.89389970e-01, -3.16227766e-01], [ 2.00891778e-01, -1.28259907e-01, 3.47203114e-01, 2.05717392e-01, -5.20496239e-01, -2.85229613e-01, -3.33913548e-01, -3.22189740e-01, 1.02486446e+00, -2.91605922e-01, -3.39683110e-01, -3.56662979e-01, 1.19350246e+00, -3.16227766e-01], [ 6.75354560e-02, 1.30370486e-01, -1.24356237e+00, -1.09544512e+00, 1.92124347e+00, -2.85229613e-01, 1.17435697e-01, -3.22189740e-01, -2.03760319e-02, 2.37238716e-02, -6.14116358e-02, 1.83033261e-02, -1.88880849e-01, 3.32307144e-01], [-5.79970088e-02, 1.30980991e-03, 3.67861863e-01, 3.32690739e-01, -5.20496239e-01, -2.85229613e-01, 2.57855462e-01, -3.22189740e-01, -3.78641223e-01, 2.59637273e-01, 3.41360558e-01, 3.22244268e-01, -3.89389970e-01, 1.47572957e-01]])
# Number of iterations the best run took to converge.
kmeans.n_iter_
8
# Cluster assignment (0-4) for every training row.
kmeans.labels_[:]
array([4, 4, 3, 3, 4, 2, 3, 4, 4, 2, 4, 4, 2, 2, 4, 0, 0, 4, 3, 4, 4, 2, 2, 3, 4, 4, 4, 2, 2, 1, 2, 0, 4, 2, 3, 4, 2, 4, 3, 1, 3, 2, 2, 4, 4, 4, 3, 4, 3, 4, 1, 4, 3, 4, 2, 4, 2, 4, 4, 4, 3, 2, 2, 3, 4, 4, 0, 0, 4, 4, 2, 4, 4, 4, 4, 3, 2, 2, 4, 2, 1, 4, 3, 2, 4, 3, 2, 4, 0, 1, 2, 2, 2, 4, 4, 4, 4, 3, 4, 3, 0, 3, 1, 2, 4, 3, 4, 0, 1, 1, 2, 3, 1, 4, 2, 4, 4, 4, 3, 4, 2, 1, 2, 1, 2, 4, 2, 2, 4, 3, 4, 2, 1, 4, 4, 4, 3, 2, 1, 0, 2, 3, 2, 2, 4, 2, 4, 0, 4, 3, 2, 4, 2, 4, 3, 0, 4, 4, 4, 4, 1, 1, 4, 2, 3, 2, 1, 0, 2, 4, 4, 4, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 3, 3, 1, 4, 4, 2, 3, 3, 4, 3, 3, 1, 0, 2, 0, 4, 3, 1, 2, 4, 2, 4, 4, 2, 2, 3, 2, 3, 0, 2, 4, 4, 3, 3, 1, 4, 4, 4, 3, 4, 4, 4, 4, 3, 3, 4, 4, 2, 1, 2, 4, 3, 4, 2, 4, 1, 4, 4, 4, 4, 2, 0, 1, 4, 4, 3, 1, 4, 2, 4, 3, 0, 4, 1, 4, 2, 4, 3, 3, 1, 1, 2, 0, 0, 3, 4, 4, 3, 4, 4, 4, 2, 3, 1, 2, 4, 2, 4, 4, 4, 4, 4, 4, 3, 0, 4, 4, 3, 3, 4, 3, 3, 3, 2, 2, 4, 0, 2, 4, 1, 4, 4, 4, 4, 2, 4, 3, 3, 2, 0, 4, 3, 0, 2, 2, 0, 4])
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier on the standardized training features.
model = KNeighborsClassifier(n_neighbors=1)
model.fit(scaled_features,train_set_labels)
# NOTE(review): predicting on the same data the model was fit on -- with k=1
# each point is its own nearest neighbour, so this cannot miss.
predicted = model.predict(scaled_features)
predicted
array([0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0], dtype=int64)
# Zero training errors, as expected for 1-NN evaluated on its own training
# set; this says nothing about performance on unseen data.
sum(abs(predicted-train_set_labels))
0
# NOTE(review): make_classification is imported here but never used below.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(train_set, train_set_labels)
RandomForestClassifier()
# Again evaluated on the training data itself: a random forest can memorize
# its training set, so zero errors here is not evidence of generalization.
yhat = model.predict(train_set)
sum(abs(yhat-train_set_labels))
0
# Summary statistics for every column of the feature-engineered frame.
df1.describe().transpose()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Gender | 399.0 | 5.864662e-01 | 4.930851e-01 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
Children | 399.0 | 1.313283e+00 | 1.022166e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 |
Salary | 399.0 | 7.894247e+04 | 2.723687e+04 | 2.068700e+04 | 5.812850e+04 | 7.921200e+04 | 9.579650e+04 | 1.601340e+05 |
Agree or Not | 399.0 | 4.210526e-01 | 4.943478e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 |
__Middle-aged | 399.0 | 5.463659e-01 | 4.984706e-01 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
__Young | 399.0 | 2.180451e-01 | 4.134366e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_California | 399.0 | 8.270677e-02 | 2.757843e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Florida | 399.0 | 1.027569e-01 | 3.040223e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Illinois | 399.0 | 9.523810e-02 | 2.939121e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Michigan | 399.0 | 1.152882e-01 | 3.197704e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Minnesota | 399.0 | 7.769424e-02 | 2.680259e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_New York | 399.0 | 1.152882e-01 | 3.197704e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Ohio | 399.0 | 1.102757e-01 | 3.136263e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Texas | 399.0 | 1.203008e-01 | 3.257213e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
State_Virginia | 399.0 | 9.774436e-02 | 2.973415e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
Gend_child | 399.0 | 8.020050e-01 | 1.009202e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 3.000000e+00 |
log_Salary | 399.0 | 1.121107e+01 | 3.760868e-01 | 9.937261e+00 | 1.097039e+01 | 1.127988e+01 | 1.146998e+01 | 1.198377e+01 |
Salary_Sq | 399.0 | 6.971901e+09 | 4.626720e+09 | 4.279520e+08 | 3.379095e+09 | 6.274541e+09 | 9.176971e+09 | 2.564290e+10 |
This is where we left off in lecture. Additional examples are included below; we can also go through them on another night.
from sklearn.neural_network import MLPClassifier
# Feed-forward network with four hidden layers (17, 13, 13, 13 units);
# max_iter raised to 1000 to give the solver more room to converge.
mlp = MLPClassifier(hidden_layer_sizes=(17,13,13,13),max_iter=1000)
# Fit on the STANDARDIZED features computed for the KMeans section above.
mlp.fit(scaled_features,train_set_labels)
MLPClassifier(hidden_layer_sizes=(17, 13, 13, 13), max_iter=1000)
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(17,13,13,13), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=None,
shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
verbose=False, warm_start=False)
MLPClassifier(hidden_layer_sizes=(17, 13, 13, 13), max_iter=1000)
# Fix: the network was fit on scaled_features, so it must also predict on
# scaled_features.  The original passed the raw train_set here -- despite its
# own comment noting that inputs have to be scaled -- which feeds the model
# data on a completely different scale than it was trained on.
predictions = mlp.predict(scaled_features) #to predict with the test set you'd also have to scale it
predictions
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)
from sklearn.metrics import classification_report,confusion_matrix
# The printed matrix below has an all-zero second column: like the logistic
# regression, this run predicted class 0 for every training row.
print(confusion_matrix(train_set_labels,predictions)) #this is the same result we got with logistic regression
#experiment with settings to see if you can improve it
[[184 0] [135 0]]
# Per-class precision/recall/F1; class 1 scores 0.0 because it is never predicted.
print(classification_report(train_set_labels,predictions))
precision recall f1-score support 0 0.58 1.00 0.73 184 1 0.00 0.00 0.00 135 accuracy 0.58 319 macro avg 0.29 0.50 0.37 319 weighted avg 0.33 0.58 0.42 319
C:\Users\Betsy\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
import re
txt = "The rain in Spain is mainly on the plain."
x = re.findall("ai", txt)
print(x)
['ai', 'ai', 'ai', 'ai']
x = re.split("\s", txt)
print(x)
['The', 'rain', 'in', 'Spain', 'is', 'mainly', 'on', 'the', 'plain.']
x = re.split("\s", txt, 1) #splits at only the first space
print(x)
['The', 'rain in Spain is mainly on the plain.']
x = re.sub("\s", "-", txt)
print(x)
The-rain-in-Spain-is-mainly-on-the-plain.
x = re.search(r"\bS\w+", txt) #looks for a word with uppercase S at the start, gives start and end position of word
print(x.span())
(12, 17)
print(x.string) #prints whole string where the word appeared
The rain in Spain is mainly on the plain.
print(x.group()) #prints just the word
Spain
x = re.findall("[mat]", txt)
print(x)
['a', 'a', 'm', 'a', 't', 'a']
x = re.findall("ain+", txt)
print(x)
['ain', 'ain', 'ain', 'ain']
#https://www.w3schools.com/python/python_regex.asp more code keys here