import pandas as pd
import numpy as np
import io
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score, classification_report, precision_recall_fscore_support
from sklearn import linear_model
from sklearn.linear_model import LogisticRegressionCV, Lasso, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
pd.set_option("display.max_columns", 100)
randomstate = 42
cv = 10
n_features = 5
def info_to_dataframe(df):
buffer = io.StringIO()
df.info(buf=buffer)
info_string = buffer.getvalue()
info_lines = info_string.splitlines()
data = []
for line in info_lines[5:-2]:
split_line = line.split(maxsplit=4)
if len(split_line) == 5:
non_null_count = split_line[2].replace(',', '').strip()
dtype = split_line[4]
null_count = len(df) - int(non_null_count)
else:
non_null_count = None
dtype = None
row_data = {
'Column': split_line[1],
'Non-Null Count': non_null_count,
'Null Count': null_count,
'Dtype': dtype
}
data.append(row_data)
info_df = pd.DataFrame(data)
return info_df.sort_values(by='Null Count', ascending=False).reset_index(drop=True)
def plot_correlation_matrix(df, column1, column2):
mydf = df[[column1, column2]]
correlation_matrix = mydf.corr()
plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, square=True, cbar_kws={"shrink": .8})
plt.title(f'Correlation Matrix between {column1} and {column2}', fontsize=14)
plt.xticks(ticks=[0.5, 1.5], labels=[column1, column2])
plt.yticks(ticks=[0.5, 1.5], labels=[column1, column2])
plt.show()
def plot_pie_chart(sizes, labels, colors, title):
plt.figure(figsize=(14, 8))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.axis('equal')
plt.legend(loc="lower right",
labels=[f'{category}: N={value}' for category, value in zip(
labels_vaccine_categ, sizes_vaccine_categ)])
plt.title(title)
plt.show()
Task: Can you predict whether people got H1N1 and seasonal flu vaccines using information they shared about their backgrounds, opinions, and health behaviors?¶
Loading the data¶
Training Features¶
These are the input variables that your model will use to predict the probability that people received H1N1 flu and seasonal flu vaccines. There are 35 feature columns in total, each a response to a survey question. These questions cover several different topics, such as whether people observed safe behavioral practices, their opinions about the diseases and the vaccines, and their demographics. Check out the problem description page for more information.
train_features_df = pd.read_csv('training_set_features.csv')
train_features_df['feat'] = 'Train'
train_features_df.head()
respondent_id | h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | age_group | education | race | sex | income_poverty | marital_status | rent_or_own | employment_status | hhs_geo_region | census_msa | household_adults | household_children | employment_industry | employment_occupation | feat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 1.0 | 2.0 | 2.0 | 1.0 | 2.0 | 55 - 64 Years | < 12 Years | White | Female | Below Poverty | Not Married | Own | Not in Labor Force | oxchjgsf | Non-MSA | 0.0 | 0.0 | NaN | NaN | Train |
1 | 1 | 3.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 35 - 44 Years | 12 Years | White | Male | Below Poverty | Not Married | Rent | Employed | bhuqouqj | MSA, Not Principle City | 0.0 | 0.0 | pxcmvdjn | xgwztkwe | Train |
2 | 2 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 1.0 | 0.0 | 0.0 | NaN | 3.0 | 1.0 | 1.0 | 4.0 | 1.0 | 2.0 | 18 - 34 Years | College Graduate | White | Male | <= $75,000, Above Poverty | Not Married | Own | Employed | qufhixun | MSA, Not Principle City | 2.0 | 0.0 | rucpziij | xtkaffoo | Train |
3 | 3 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | NaN | 3.0 | 3.0 | 5.0 | 5.0 | 4.0 | 1.0 | 65+ Years | 12 Years | White | Female | Below Poverty | Not Married | Rent | Not in Labor Force | lrircsnp | MSA, Principle City | 0.0 | 0.0 | NaN | NaN | Train |
4 | 4 | 2.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 3.0 | 3.0 | 2.0 | 3.0 | 1.0 | 4.0 | 45 - 54 Years | Some College | White | Female | <= $75,000, Above Poverty | Married | Own | Employed | qufhixun | MSA, Not Principle City | 1.0 | 0.0 | wxleyezf | emcorrxb | Train |
test_features_df = pd.read_csv('test_set_features.csv')
test_features_df['feat'] = 'Test'
test_features_df.head()
respondent_id | h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | age_group | education | race | sex | income_poverty | marital_status | rent_or_own | employment_status | hhs_geo_region | census_msa | household_adults | household_children | employment_industry | employment_occupation | feat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 26707 | 2.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 1.0 | 1.0 | 5.0 | 1.0 | 1.0 | 35 - 44 Years | College Graduate | Hispanic | Female | > $75,000 | Not Married | Rent | Employed | mlyzmhmf | MSA, Not Principle City | 1.0 | 0.0 | atmlpfrs | hfxkjkmi | Test |
1 | 26708 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1.0 | 1.0 | 4.0 | 1.0 | 1.0 | 18 - 34 Years | 12 Years | White | Male | Below Poverty | Not Married | Rent | Employed | bhuqouqj | Non-MSA | 3.0 | 0.0 | atmlpfrs | xqwwgdyp | Test |
2 | 26709 | 2.0 | 2.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 5.0 | 4.0 | 2.0 | 5.0 | 4.0 | 4.0 | 55 - 64 Years | College Graduate | White | Male | > $75,000 | Married | Own | Employed | lrircsnp | Non-MSA | 1.0 | 0.0 | nduyfdeo | pvmttkik | Test |
3 | 26710 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 4.0 | 2.0 | 2.0 | 4.0 | 4.0 | 2.0 | 65+ Years | 12 Years | White | Female | <= $75,000, Above Poverty | Married | Own | Not in Labor Force | lrircsnp | MSA, Not Principle City | 1.0 | 0.0 | NaN | NaN | Test |
4 | 26711 | 3.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 5.0 | 2.0 | 4.0 | 4.0 | 4.0 | 2.0 | 35 - 44 Years | 12 Years | Black | Female | <= $75,000, Above Poverty | Not Married | Own | Employed | lzgpxyit | Non-MSA | 0.0 | 1.0 | fcxhlnwr | mxkfnird | Test |
features_df = pd.concat([train_features_df, test_features_df])
features_df#.head()
respondent_id | h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | age_group | education | race | sex | income_poverty | marital_status | rent_or_own | employment_status | hhs_geo_region | census_msa | household_adults | household_children | employment_industry | employment_occupation | feat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 1.0 | 2.0 | 2.0 | 1.0 | 2.0 | 55 - 64 Years | < 12 Years | White | Female | Below Poverty | Not Married | Own | Not in Labor Force | oxchjgsf | Non-MSA | 0.0 | 0.0 | NaN | NaN | Train |
1 | 1 | 3.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 35 - 44 Years | 12 Years | White | Male | Below Poverty | Not Married | Rent | Employed | bhuqouqj | MSA, Not Principle City | 0.0 | 0.0 | pxcmvdjn | xgwztkwe | Train |
2 | 2 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 1.0 | 0.0 | 0.0 | NaN | 3.0 | 1.0 | 1.0 | 4.0 | 1.0 | 2.0 | 18 - 34 Years | College Graduate | White | Male | <= $75,000, Above Poverty | Not Married | Own | Employed | qufhixun | MSA, Not Principle City | 2.0 | 0.0 | rucpziij | xtkaffoo | Train |
3 | 3 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | NaN | 3.0 | 3.0 | 5.0 | 5.0 | 4.0 | 1.0 | 65+ Years | 12 Years | White | Female | Below Poverty | Not Married | Rent | Not in Labor Force | lrircsnp | MSA, Principle City | 0.0 | 0.0 | NaN | NaN | Train |
4 | 4 | 2.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 3.0 | 3.0 | 2.0 | 3.0 | 1.0 | 4.0 | 45 - 54 Years | Some College | White | Female | <= $75,000, Above Poverty | Married | Own | Employed | qufhixun | MSA, Not Principle City | 1.0 | 0.0 | wxleyezf | emcorrxb | Train |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26703 | 53410 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | NaN | 4.0 | 2.0 | 2.0 | 4.0 | 2.0 | 1.0 | 35 - 44 Years | NaN | White | Female | NaN | NaN | NaN | NaN | dqpwygqj | MSA, Principle City | 1.0 | 1.0 | NaN | NaN | Test |
26704 | 53411 | 3.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 4.0 | 1.0 | 1.0 | 5.0 | 2.0 | 2.0 | 18 - 34 Years | 12 Years | White | Male | Below Poverty | Married | Rent | Employed | qufhixun | Non-MSA | 1.0 | 3.0 | fcxhlnwr | vlluhbov | Test |
26705 | 53412 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | 3.0 | 1.0 | 4.0 | 3.0 | 1.0 | 18 - 34 Years | Some College | White | Female | Below Poverty | Not Married | Rent | Not in Labor Force | qufhixun | MSA, Not Principle City | 1.0 | 0.0 | NaN | NaN | Test |
26706 | 53413 | 3.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | NaN | 2.0 | 3.0 | 4.0 | 4.0 | 3.0 | 2.0 | 55 - 64 Years | Some College | White | Female | <= $75,000, Above Poverty | Married | Own | Not in Labor Force | bhuqouqj | MSA, Not Principle City | 1.0 | 0.0 | NaN | NaN | Test |
26707 | 53414 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 1.0 | 2.0 | 5.0 | 4.0 | 2.0 | 45 - 54 Years | College Graduate | White | Female | NaN | Not Married | Rent | Employed | lrircsnp | MSA, Principle City | 0.0 | 0.0 | NaN | xtkaffoo | Test |
53415 rows × 37 columns
features_df.columns
Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children', 'employment_industry', 'employment_occupation', 'feat'], dtype='object')
features_df.describe()
respondent_id | h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | household_adults | household_children | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 53415.000000 | 53238.000000 | 53177.000000 | 53265.000000 | 52994.000000 | 53377.000000 | 53333.000000 | 53256.000000 | 53251.000000 | 53159.000000 | 49095.000000 | 49095.000000 | 51512.000000 | 51782.000000 | 51822.000000 | 28913.000000 | 52626.000000 | 52647.000000 | 52645.000000 | 52501.000000 | 52402.000000 | 52357.000000 | 52941.000000 | 52941.000000 |
mean | 26707.000000 | 1.620816 | 1.264287 | 0.049244 | 0.727705 | 0.069131 | 0.825849 | 0.355077 | 0.337271 | 0.680506 | 0.221489 | 0.331643 | 0.282148 | 0.084450 | 0.111709 | 0.883824 | 3.847623 | 2.334701 | 2.359141 | 4.025409 | 2.713923 | 2.130756 | 0.890406 | 0.539166 |
std | 15419.726651 | 0.906534 | 0.616881 | 0.216380 | 0.445145 | 0.253679 | 0.379243 | 0.478541 | 0.472783 | 0.466285 | 0.415253 | 0.470808 | 0.450049 | 0.278064 | 0.315012 | 0.320442 | 1.007498 | 1.280608 | 1.361078 | 1.084875 | 1.380553 | 1.336077 | 0.753836 | 0.931626 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
25% | 13353.500000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 1.000000 | 1.000000 | 4.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 |
50% | 26707.000000 | 2.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 4.000000 | 2.000000 | 2.000000 | 4.000000 | 2.000000 | 2.000000 | 1.000000 | 0.000000 |
75% | 40060.500000 | 2.000000 | 2.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 5.000000 | 4.000000 | 4.000000 | 5.000000 | 4.000000 | 4.000000 | 1.000000 | 1.000000 |
max | 53414.000000 | 3.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 3.000000 | 3.000000 |
features_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 53415 entries, 0 to 26707 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 respondent_id 53415 non-null int64 1 h1n1_concern 53238 non-null float64 2 h1n1_knowledge 53177 non-null float64 3 behavioral_antiviral_meds 53265 non-null float64 4 behavioral_avoidance 52994 non-null float64 5 behavioral_face_mask 53377 non-null float64 6 behavioral_wash_hands 53333 non-null float64 7 behavioral_large_gatherings 53256 non-null float64 8 behavioral_outside_home 53251 non-null float64 9 behavioral_touch_face 53159 non-null float64 10 doctor_recc_h1n1 49095 non-null float64 11 doctor_recc_seasonal 49095 non-null float64 12 chronic_med_condition 51512 non-null float64 13 child_under_6_months 51782 non-null float64 14 health_worker 51822 non-null float64 15 health_insurance 28913 non-null float64 16 opinion_h1n1_vacc_effective 52626 non-null float64 17 opinion_h1n1_risk 52647 non-null float64 18 opinion_h1n1_sick_from_vacc 52645 non-null float64 19 opinion_seas_vacc_effective 52501 non-null float64 20 opinion_seas_risk 52402 non-null float64 21 opinion_seas_sick_from_vacc 52357 non-null float64 22 age_group 53415 non-null object 23 education 50601 non-null object 24 race 53415 non-null object 25 sex 53415 non-null object 26 income_poverty 44495 non-null object 27 marital_status 50565 non-null object 28 rent_or_own 49337 non-null object 29 employment_status 50481 non-null object 30 hhs_geo_region 53415 non-null object 31 census_msa 53415 non-null object 32 household_adults 52941 non-null float64 33 household_children 52941 non-null float64 34 employment_industry 26810 non-null object 35 employment_occupation 26519 non-null object 36 feat 53415 non-null object dtypes: float64(23), int64(1), object(13) memory usage: 15.5+ MB
Training Labels¶
These are the labels corresponding to the observations in the training features. There are two target variables: h1n1_vaccine and seasonal_vaccine. Both are binary variables, with 1 indicating that a person received the respective flu vaccine and 0 indicating that a person did not receive the respective flu vaccine. Note that this is what is known as a "multilabel" modeling task
labels_df = pd.read_csv('training_set_labels.csv')
labels_df[['h1n1_vaccine','seasonal_vaccine']] = labels_df[
['h1n1_vaccine','seasonal_vaccine']].astype('category')
labels_df.head()
respondent_id | h1n1_vaccine | seasonal_vaccine | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 1 | 0 | 1 |
2 | 2 | 0 | 0 |
3 | 3 | 0 | 1 |
4 | 4 | 0 | 0 |
labels_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 26707 entries, 0 to 26706 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 respondent_id 26707 non-null int64 1 h1n1_vaccine 26707 non-null category 2 seasonal_vaccine 26707 non-null category dtypes: category(2), int64(1) memory usage: 261.2 KB
labels_df['VaccineCategory'] = np.where(
((labels_df['h1n1_vaccine']==1)&(labels_df['seasonal_vaccine']==1)), "Both Vaccines",
np.where(
((labels_df['h1n1_vaccine']==1)&(labels_df['seasonal_vaccine']==0)), "Seasonal Only",
np.where(
((labels_df['h1n1_vaccine']==0)&(labels_df['seasonal_vaccine']==1)), "H1N1 Only",
np.where(
((labels_df['h1n1_vaccine']==0)&(labels_df['seasonal_vaccine']==0)), "No Vaccine",
"Unknown"
)
)
)
)
df_vaccine_categ = labels_df.groupby('VaccineCategory')['respondent_id'].nunique().reset_index(
name='N').sort_values(by="N").reset_index(drop=True)
labels_vaccine_categ = df_vaccine_categ['VaccineCategory'].unique().tolist()
sizes_vaccine_categ = df_vaccine_categ['N'].tolist()
colors_vaccine_categ = ['#ADD8E6','#e57373','#81c784','#FFFF99']
plot_pie_chart(
sizes_vaccine_categ,
labels=labels_vaccine_categ,
colors=colors_vaccine_categ,
title = 'Respondent Distro by VaccineCategory'
)
h1n1_counts = labels_df.groupby('h1n1_vaccine')['respondent_id'].nunique()
seasonal_counts = labels_df.groupby('seasonal_vaccine')['respondent_id'].nunique()
h1n1_percent = (h1n1_counts / h1n1_counts.sum()) * 100
seasonal_percent = (seasonal_counts / seasonal_counts.sum()) * 100
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
h1n1_counts.plot(kind='barh', ax=axes[0], color='skyblue')
for idx, value in enumerate(h1n1_counts):
axes[0].text(value - 20, idx, f'{value} ({h1n1_percent[idx]:.0f}%)', va='center', ha='right', color='white')
axes[0].set_title('H1N1 Vaccine Counts')
axes[0].set_xlabel('')
axes[0].set_ylabel('')
axes[0].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
seasonal_counts.plot(kind='barh', ax=axes[1], color='salmon')
for idx, value in enumerate(seasonal_counts):
axes[1].text(value - 20, idx, f'{value} ({seasonal_percent[idx]:.0f}%)', va='center', ha='right', color='white')
axes[1].set_title('Seasonal Vaccine Counts')
axes[1].set_xlabel('')
axes[1].set_ylabel('')
axes[1].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tight_layout()
plt.show()
We observe above that the H1N1 Vaccine respondents data is highly imbalanced.¶
Independence of multilabel variables¶
To check if the two multilabel responses (h1n1_vaccine and seasonal_vaccine) are independent, you can use the Chi-Square test of independence. This test helps determine if there is a significant association between two categorical variables.
Steps: Create a contingency table from the two columns. Perform the Chi-Square test of independence.
# Create a contingency table
contingency_table = pd.crosstab(labels_df['h1n1_vaccine'], labels_df['seasonal_vaccine'])
# Perform Chi-Square test of independence
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Print the results
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies Table:")
print(expected)
# Interpretation
alpha = 0.05
if p < alpha:
print("Reject the null hypothesis. The two variables are dependent.")
else:
print("Fail to reject the null hypothesis. The two variables are independent.")
Chi-Square Statistic: 3796.8641900765715 P-value: 0.0 Degrees of Freedom: 1 Expected Frequencies Table: [[11239.86130977 9793.13869023] [ 3032.13869023 2641.86130977]] Reject the null hypothesis. The two variables are dependent.
This further proves dependency in such a way that most of the people who received H1N1 vaccine also received the Seasonal Vaccine¶
Data Consolidation¶
** Check column type.
** Ensure 0 and 1 responses that are True/False are represented as should
** Ensure all numerical variables are represented as numericals.
** Use "No Response" as the variable for missing values on the columns of type object.
object_cols = []
for each in features_df.columns.tolist():
df_each = features_df[each]
type_col = df_each.dtype
try:
max_val = df_each.max()
min_val = df_each.min()
if min_val == 0 and max_val == 1:
type_col = object
object_cols.append(each)
except:
pass
null_object_cols = []
for each in features_df.columns.tolist():
df_each = features_df[each]
type_col = df_each.dtype
if type_col == object and df_each.isna().any() and each not in object_cols:
null_object_cols.append(each)
features_df[object_cols] = features_df[object_cols].apply(lambda x: x.map({1: 'True', 0: 'False', np.nan: 'No response'})).copy()
features_df[null_object_cols] = features_df[null_object_cols].fillna("No response").copy()
features_df.head()
respondent_id | h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | age_group | education | race | sex | income_poverty | marital_status | rent_or_own | employment_status | hhs_geo_region | census_msa | household_adults | household_children | employment_industry | employment_occupation | feat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 0.0 | False | False | False | False | False | True | True | False | False | False | False | False | True | 3.0 | 1.0 | 2.0 | 2.0 | 1.0 | 2.0 | 55 - 64 Years | < 12 Years | White | Female | Below Poverty | Not Married | Own | Not in Labor Force | oxchjgsf | Non-MSA | 0.0 | 0.0 | No response | No response | Train |
1 | 1 | 3.0 | 2.0 | False | True | False | True | False | True | True | False | False | False | False | False | True | 5.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 35 - 44 Years | 12 Years | White | Male | Below Poverty | Not Married | Rent | Employed | bhuqouqj | MSA, Not Principle City | 0.0 | 0.0 | pxcmvdjn | xgwztkwe | Train |
2 | 2 | 1.0 | 1.0 | False | True | False | False | False | False | False | No response | No response | True | False | False | No response | 3.0 | 1.0 | 1.0 | 4.0 | 1.0 | 2.0 | 18 - 34 Years | College Graduate | White | Male | <= $75,000, Above Poverty | Not Married | Own | Employed | qufhixun | MSA, Not Principle City | 2.0 | 0.0 | rucpziij | xtkaffoo | Train |
3 | 3 | 1.0 | 1.0 | False | True | False | True | True | False | False | False | True | True | False | False | No response | 3.0 | 3.0 | 5.0 | 5.0 | 4.0 | 1.0 | 65+ Years | 12 Years | White | Female | Below Poverty | Not Married | Rent | Not in Labor Force | lrircsnp | MSA, Principle City | 0.0 | 0.0 | No response | No response | Train |
4 | 4 | 2.0 | 1.0 | False | True | False | True | True | False | True | False | False | False | False | False | No response | 3.0 | 3.0 | 2.0 | 3.0 | 1.0 | 4.0 | 45 - 54 Years | Some College | White | Female | <= $75,000, Above Poverty | Married | Own | Employed | qufhixun | MSA, Not Principle City | 1.0 | 0.0 | wxleyezf | emcorrxb | Train |
As observed from above, in order to take care of the blanks, for the case of categorical variables, they have been replaced with "No Response". As for 0's and 1's, they have been replaced by True and False values.
The last bit would to replace the null values for integers and floats with median values for the entire columns.
int_object_cols = []
int_object_cols_medians = []
for each in features_df.columns.tolist():
df_each = features_df[each]
type_col = df_each.dtype
if type_col == float or type_col == int and each not in ['respondent_id']:
if df_each.isna().any():
int_object_cols.append(each)
median_col = df_each.median()
int_object_cols_medians.append(median_col)
features_df_fin = features_df.copy()
for column, median in zip(int_object_cols, int_object_cols_medians):
features_df_fin[column].fillna(median, inplace=True)
features_df_fin.head()
respondent_id | h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | age_group | education | race | sex | income_poverty | marital_status | rent_or_own | employment_status | hhs_geo_region | census_msa | household_adults | household_children | employment_industry | employment_occupation | feat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 0.0 | False | False | False | False | False | True | True | False | False | False | False | False | True | 3.0 | 1.0 | 2.0 | 2.0 | 1.0 | 2.0 | 55 - 64 Years | < 12 Years | White | Female | Below Poverty | Not Married | Own | Not in Labor Force | oxchjgsf | Non-MSA | 0.0 | 0.0 | No response | No response | Train |
1 | 1 | 3.0 | 2.0 | False | True | False | True | False | True | True | False | False | False | False | False | True | 5.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 35 - 44 Years | 12 Years | White | Male | Below Poverty | Not Married | Rent | Employed | bhuqouqj | MSA, Not Principle City | 0.0 | 0.0 | pxcmvdjn | xgwztkwe | Train |
2 | 2 | 1.0 | 1.0 | False | True | False | False | False | False | False | No response | No response | True | False | False | No response | 3.0 | 1.0 | 1.0 | 4.0 | 1.0 | 2.0 | 18 - 34 Years | College Graduate | White | Male | <= $75,000, Above Poverty | Not Married | Own | Employed | qufhixun | MSA, Not Principle City | 2.0 | 0.0 | rucpziij | xtkaffoo | Train |
3 | 3 | 1.0 | 1.0 | False | True | False | True | True | False | False | False | True | True | False | False | No response | 3.0 | 3.0 | 5.0 | 5.0 | 4.0 | 1.0 | 65+ Years | 12 Years | White | Female | Below Poverty | Not Married | Rent | Not in Labor Force | lrircsnp | MSA, Principle City | 0.0 | 0.0 | No response | No response | Train |
4 | 4 | 2.0 | 1.0 | False | True | False | True | True | False | True | False | False | False | False | False | No response | 3.0 | 3.0 | 2.0 | 3.0 | 1.0 | 4.0 | 45 - 54 Years | Some College | White | Female | <= $75,000, Above Poverty | Married | Own | Employed | qufhixun | MSA, Not Principle City | 1.0 | 0.0 | wxleyezf | emcorrxb | Train |
features_df_fin.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 53415 entries, 0 to 26707 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 respondent_id 53415 non-null int64 1 h1n1_concern 53415 non-null float64 2 h1n1_knowledge 53415 non-null float64 3 behavioral_antiviral_meds 53415 non-null object 4 behavioral_avoidance 53415 non-null object 5 behavioral_face_mask 53415 non-null object 6 behavioral_wash_hands 53415 non-null object 7 behavioral_large_gatherings 53415 non-null object 8 behavioral_outside_home 53415 non-null object 9 behavioral_touch_face 53415 non-null object 10 doctor_recc_h1n1 53415 non-null object 11 doctor_recc_seasonal 53415 non-null object 12 chronic_med_condition 53415 non-null object 13 child_under_6_months 53415 non-null object 14 health_worker 53415 non-null object 15 health_insurance 53415 non-null object 16 opinion_h1n1_vacc_effective 53415 non-null float64 17 opinion_h1n1_risk 53415 non-null float64 18 opinion_h1n1_sick_from_vacc 53415 non-null float64 19 opinion_seas_vacc_effective 53415 non-null float64 20 opinion_seas_risk 53415 non-null float64 21 opinion_seas_sick_from_vacc 53415 non-null float64 22 age_group 53415 non-null object 23 education 53415 non-null object 24 race 53415 non-null object 25 sex 53415 non-null object 26 income_poverty 53415 non-null object 27 marital_status 53415 non-null object 28 rent_or_own 53415 non-null object 29 employment_status 53415 non-null object 30 hhs_geo_region 53415 non-null object 31 census_msa 53415 non-null object 32 household_adults 53415 non-null float64 33 household_children 53415 non-null float64 34 employment_industry 53415 non-null object 35 employment_occupation 53415 non-null object 36 feat 53415 non-null object dtypes: float64(10), int64(1), object(26) memory usage: 15.5+ MB
From the above, we can see that now all the columns have the same number of records, therefore, NO NULL values.
As per the task, we are asked to predict whether people got H1N1 and seasonal flu vaccines using information they shared about their backgrounds, opinions, and health behaviors?
Again, H1N1 AND seasonal flu, so, we would be moving from a multilabel model to a single label model, whereby the labels would only focus on whether the respondent received both vaccines or NOT.
labels_df['Y'] = np.where(
labels_df['VaccineCategory'] == "Both Vaccines", 1, 0
)
labels_df['One_vaccine_only'] = np.where(
labels_df['VaccineCategory'].isin(["Seasonal Only","H1N1 Only"]), True, False
)
labels_df.head()
respondent_id | h1n1_vaccine | seasonal_vaccine | VaccineCategory | Y | One_vaccine_only | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | No Vaccine | 0 | False |
1 | 1 | 0 | 1 | H1N1 Only | 0 | True |
2 | 2 | 0 | 0 | No Vaccine | 0 | False |
3 | 3 | 0 | 1 | H1N1 Only | 0 | True |
4 | 4 | 0 | 0 | No Vaccine | 0 | False |
df_vaccine_categ2 = labels_df.groupby('Y')[
'respondent_id'].nunique().reset_index(name='N').sort_values(by="N").reset_index(drop=True)
df_vaccine_categ2
Y | N | |
---|---|---|
0 | 1 | 4697 |
1 | 0 | 22010 |
Final DF with labels and features joined.¶
final_df = pd.merge(
features_df_fin,
labels_df[['respondent_id','Y']],
on=['respondent_id'],
how='left'
)
final_df['Y'] = final_df['Y'].astype(object)
final_df = final_df.drop(columns=['respondent_id'])
final_df.head()
h1n1_concern | h1n1_knowledge | behavioral_antiviral_meds | behavioral_avoidance | behavioral_face_mask | behavioral_wash_hands | behavioral_large_gatherings | behavioral_outside_home | behavioral_touch_face | doctor_recc_h1n1 | doctor_recc_seasonal | chronic_med_condition | child_under_6_months | health_worker | health_insurance | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | age_group | education | race | sex | income_poverty | marital_status | rent_or_own | employment_status | hhs_geo_region | census_msa | household_adults | household_children | employment_industry | employment_occupation | feat | Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.0 | False | False | False | False | False | True | True | False | False | False | False | False | True | 3.0 | 1.0 | 2.0 | 2.0 | 1.0 | 2.0 | 55 - 64 Years | < 12 Years | White | Female | Below Poverty | Not Married | Own | Not in Labor Force | oxchjgsf | Non-MSA | 0.0 | 0.0 | No response | No response | Train | 0.0 |
1 | 3.0 | 2.0 | False | True | False | True | False | True | True | False | False | False | False | False | True | 5.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 35 - 44 Years | 12 Years | White | Male | Below Poverty | Not Married | Rent | Employed | bhuqouqj | MSA, Not Principle City | 0.0 | 0.0 | pxcmvdjn | xgwztkwe | Train | 0.0 |
2 | 1.0 | 1.0 | False | True | False | False | False | False | False | No response | No response | True | False | False | No response | 3.0 | 1.0 | 1.0 | 4.0 | 1.0 | 2.0 | 18 - 34 Years | College Graduate | White | Male | <= $75,000, Above Poverty | Not Married | Own | Employed | qufhixun | MSA, Not Principle City | 2.0 | 0.0 | rucpziij | xtkaffoo | Train | 0.0 |
3 | 1.0 | 1.0 | False | True | False | True | True | False | False | False | True | True | False | False | No response | 3.0 | 3.0 | 5.0 | 5.0 | 4.0 | 1.0 | 65+ Years | 12 Years | White | Female | Below Poverty | Not Married | Rent | Not in Labor Force | lrircsnp | MSA, Principle City | 0.0 | 0.0 | No response | No response | Train | 0.0 |
4 | 2.0 | 1.0 | False | True | False | True | True | False | True | False | False | False | False | False | No response | 3.0 | 3.0 | 2.0 | 3.0 | 1.0 | 4.0 | 45 - 54 Years | Some College | White | Female | <= $75,000, Above Poverty | Married | Own | Employed | qufhixun | MSA, Not Principle City | 1.0 | 0.0 | wxleyezf | emcorrxb | Train | 0.0 |
final_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 53415 entries, 0 to 53414 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 h1n1_concern 53415 non-null float64 1 h1n1_knowledge 53415 non-null float64 2 behavioral_antiviral_meds 53415 non-null object 3 behavioral_avoidance 53415 non-null object 4 behavioral_face_mask 53415 non-null object 5 behavioral_wash_hands 53415 non-null object 6 behavioral_large_gatherings 53415 non-null object 7 behavioral_outside_home 53415 non-null object 8 behavioral_touch_face 53415 non-null object 9 doctor_recc_h1n1 53415 non-null object 10 doctor_recc_seasonal 53415 non-null object 11 chronic_med_condition 53415 non-null object 12 child_under_6_months 53415 non-null object 13 health_worker 53415 non-null object 14 health_insurance 53415 non-null object 15 opinion_h1n1_vacc_effective 53415 non-null float64 16 opinion_h1n1_risk 53415 non-null float64 17 opinion_h1n1_sick_from_vacc 53415 non-null float64 18 opinion_seas_vacc_effective 53415 non-null float64 19 opinion_seas_risk 53415 non-null float64 20 opinion_seas_sick_from_vacc 53415 non-null float64 21 age_group 53415 non-null object 22 education 53415 non-null object 23 race 53415 non-null object 24 sex 53415 non-null object 25 income_poverty 53415 non-null object 26 marital_status 53415 non-null object 27 rent_or_own 53415 non-null object 28 employment_status 53415 non-null object 29 hhs_geo_region 53415 non-null object 30 census_msa 53415 non-null object 31 household_adults 53415 non-null float64 32 household_children 53415 non-null float64 33 employment_industry 53415 non-null object 34 employment_occupation 53415 non-null object 35 feat 53415 non-null object 36 Y 26707 non-null object dtypes: float64(10), object(27) memory usage: 15.5+ MB
Feature Corelation¶
correlation_matrix_ = final_df.corr()
C:\Users\kevin\AppData\Local\Temp\ipykernel_25912\3626133691.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix_ = final_df.corr()
Identify Pairs with Correlation within 0.70¶
threshold = 0.70
corr_pairs = correlation_matrix_.abs().unstack().sort_values(kind="quicksort", ascending=False)
filtered_pairs = corr_pairs[(corr_pairs >= threshold) & (corr_pairs < 1)]
filtered_pairs
Series([], dtype: float64)
print("Highly correlated pairs:")
for index, value in filtered_pairs.items():
print(f"{index[0]} and {index[1]}: {value}")
Highly correlated pairs:
Feature Preprocessing¶
Scaling: Transform all numerical features to be on the same scale. This matters when using regularization, which we will discuss in the next section. We will use StandardScaler, also known as Z-score scaling. This scales and shifts features so that they have zero mean and unit variance.
Dummifying: All categorical columns need to be dummified
y_col = final_df[['Y']].columns.values.tolist()
feat_col = ['feat']
numerical_cols = final_df.columns[
((~final_df.dtypes.isin(['object','category']))&(~final_df.columns.isin(y_col+feat_col)))
].values.tolist()
categorical_cols = final_df.columns[
((final_df.dtypes.isin(['object','category']))&(~final_df.columns.isin(y_col+feat_col)))
].values.tolist()
numeric_scaled = pd.DataFrame(preprocessing.scale(np.asarray(final_df[numerical_cols])))
numeric_scaled.columns = numerical_cols
numeric_scaled#.head()
h1n1_concern | h1n1_knowledge | opinion_h1n1_vacc_effective | opinion_h1n1_risk | opinion_h1n1_sick_from_vacc | opinion_seas_vacc_effective | opinion_seas_risk | opinion_seas_sick_from_vacc | household_adults | household_children | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -0.687156 | -2.051332 | -0.849713 | -1.045526 | -0.261828 | -1.882737 | -1.240387 | -0.096883 | -1.187639 | -0.575314 |
1 | 1.522093 | 1.196734 | 1.149910 | 1.313000 | 1.217575 | -0.023220 | -0.510912 | 1.414956 | -1.187639 | -0.575314 |
2 | -0.687156 | -0.427299 | -0.849713 | -1.045526 | -1.001529 | -0.023220 | -1.240387 | -0.096883 | 1.477084 | -0.575314 |
3 | -0.687156 | -0.427299 | -0.849713 | 0.526825 | 1.957276 | 0.906538 | 0.948037 | -0.852803 | -1.187639 | -0.575314 |
4 | 0.417468 | -0.427299 | -0.849713 | 0.526825 | -0.261828 | -0.952978 | -1.240387 | 1.414956 | 0.144723 | -0.575314 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
53410 | -0.687156 | -0.427299 | 0.150098 | -0.259350 | -0.261828 | -0.023220 | -0.510912 | -0.852803 | 0.144723 | 0.501284 |
53411 | 1.522093 | -0.427299 | 0.150098 | -1.045526 | -1.001529 | 0.906538 | -0.510912 | -0.096883 | 0.144723 | 2.654480 |
53412 | -1.791781 | -0.427299 | 0.150098 | 0.526825 | -1.001529 | -0.023220 | 0.218562 | -0.852803 | 0.144723 | -0.575314 |
53413 | 1.522093 | -0.427299 | -1.849525 | 0.526825 | 1.217575 | -0.023220 | 0.218562 | -0.096883 | 0.144723 | -0.575314 |
53414 | 0.417468 | -0.427299 | 1.149910 | -1.045526 | -0.261828 | 0.906538 | 0.948037 | -0.096883 | -1.187639 | -0.575314 |
53415 rows × 10 columns
dummied_df = pd.get_dummies(final_df[categorical_cols], drop_first=True)
dummied_df.head()
behavioral_antiviral_meds_No response | behavioral_antiviral_meds_True | behavioral_avoidance_No response | behavioral_avoidance_True | behavioral_face_mask_No response | behavioral_face_mask_True | behavioral_wash_hands_No response | behavioral_wash_hands_True | behavioral_large_gatherings_No response | behavioral_large_gatherings_True | behavioral_outside_home_No response | behavioral_outside_home_True | behavioral_touch_face_No response | behavioral_touch_face_True | doctor_recc_h1n1_No response | doctor_recc_h1n1_True | doctor_recc_seasonal_No response | doctor_recc_seasonal_True | chronic_med_condition_No response | chronic_med_condition_True | child_under_6_months_No response | child_under_6_months_True | health_worker_No response | health_worker_True | health_insurance_No response | health_insurance_True | age_group_35 - 44 Years | age_group_45 - 54 Years | age_group_55 - 64 Years | age_group_65+ Years | education_< 12 Years | education_College Graduate | education_No response | education_Some College | race_Hispanic | race_Other or Multiple | race_White | sex_Male | income_poverty_> $75,000 | income_poverty_Below Poverty | income_poverty_No response | marital_status_No response | marital_status_Not Married | rent_or_own_Own | rent_or_own_Rent | employment_status_No response | employment_status_Not in Labor Force | employment_status_Unemployed | hhs_geo_region_bhuqouqj | hhs_geo_region_dqpwygqj | ... | hhs_geo_region_lzgpxyit | hhs_geo_region_mlyzmhmf | hhs_geo_region_oxchjgsf | hhs_geo_region_qufhixun | census_msa_MSA, Principle City | census_msa_Non-MSA | employment_industry_arjwrbjb | employment_industry_atmlpfrs | employment_industry_cfqqtusy | employment_industry_dotnnunm | employment_industry_fcxhlnwr | employment_industry_haxffmxo | employment_industry_ldnlellj | employment_industry_mcubkhph | employment_industry_mfikgejo | employment_industry_msuufmds | employment_industry_nduyfdeo | employment_industry_phxvnwax | employment_industry_pxcmvdjn | employment_industry_qnlwzans | employment_industry_rucpziij | employment_industry_saaquncn | employment_industry_vjjrobsf | employment_industry_wlfvacwt | employment_industry_wxleyezf | employment_industry_xicduogh | employment_industry_xqicxuve | employment_occupation_bxpfxfdn | employment_occupation_ccgxvspp | employment_occupation_cmhcxjea | employment_occupation_dcjcmpih | employment_occupation_dlvbwzss | employment_occupation_emcorrxb | employment_occupation_haliazsg | employment_occupation_hfxkjkmi | employment_occupation_hodpvpew | employment_occupation_kldqjyjy | employment_occupation_mxkfnird | employment_occupation_oijqvulv | employment_occupation_pvmttkik | employment_occupation_qxajmpny | employment_occupation_rcertsgn | employment_occupation_tfqavkke | employment_occupation_ukymxvdu | employment_occupation_uqqtjvyb | employment_occupation_vlluhbov | employment_occupation_xgwztkwe | employment_occupation_xqwwgdyp | employment_occupation_xtkaffoo | employment_occupation_xzmlyyjv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 103 columns
X_df_ = pd.concat([numeric_scaled,dummied_df, final_df[['feat']]], axis=1)
X_df = X_df_[X_df_['feat']=='Train'].drop(columns='feat')
X_columns = X_df.columns.tolist()
Y_df = final_df[y_col][:len(X_df)].astype(int).astype('category').copy()
# Y_df#.head()
X = X_df.copy()
Y = Y_df.squeeze()
print(X.shape, Y.shape)
(26707, 113) (26707,)
Notice that we have 114 columns, after dummifying the categorical columns, and dropping the first column.
Also NOTE that not all of these 114 columns will be necessary features for implementing in our final model. Some of these features are unnecessary in determining whether the respondent received both vaccines or NOT. These unnecessary features will not be included in the final model.
In order to determine these features and their importance, a LASSO logistic regression would be what's used to determine each feature importance.
Feature importance - Lasso regression¶
n = n_features
kf = KFold(n_splits=n)
offset = 0.000000001
number_of_steps = 50
maxlambda = 2.5
step = maxlambda / number_of_steps
unique_lambdas = np.arange(0 + offset, maxlambda + step, step)
save_avg_coef = []
save_avg_intercept = []
for the_lambda in unique_lambdas:
sum_coef = [0] * len(X_columns)
sum_intercept = 0
for train_index, test_index in kf.split(X):
X_train, X_test = X, X
Y_train, Y_test = Y, Y
clf = linear_model.LogisticRegression(C = the_lambda, max_iter=1000)
clf.fit(X_train, Y_train)
for i in range(len(clf.coef_[0])):
sum_coef[i] += clf.coef_[0][i]
sum_intercept += clf.intercept_
avg_coef = [0] * len(X_columns)
for i in range(len(sum_coef)):
avg_coef[i] = sum_coef[i] / n
save_avg_coef.append(avg_coef)
avg_intercept = sum_intercept / n
save_avg_intercept.append(avg_intercept)
#LASSO PLOT, FULL
plt.figure(figsize=(14, 8))
ax = plt.gca()
ax.plot(unique_lambdas, save_avg_coef)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Lasso coefficients ALL')
plt.axis('tight')
plt.show()
pred_index = 38
display("PREDICTOR LAMBDA:", unique_lambdas[pred_index])
'PREDICTOR LAMBDA:'
1.9000000010000002
coeff_df = pd.DataFrame(columns=['ColName','Coeff'])
coeff_df['ColName'] = X_columns
coeff_df['Coeff'] = save_avg_coef[pred_index]
coeff_df['Magnitude'] = np.where(
coeff_df['Coeff']<0, coeff_df['Coeff']*-1, coeff_df['Coeff']
)
coeff_df = coeff_df.sort_values(by='Magnitude',ascending=False)
coeff_df
ColName | Coeff | Magnitude | |
---|---|---|---|
25 | doctor_recc_h1n1_True | 1.548399 | 1.548399 |
74 | employment_industry_haxffmxo | 1.410665 | 1.410665 |
93 | employment_occupation_dcjcmpih | 1.410665 | 1.410665 |
10 | behavioral_antiviral_meds_No response | 0.882929 | 0.882929 |
34 | health_insurance_No response | -0.860136 | 0.860136 |
... | ... | ... | ... |
8 | household_adults | -0.034367 | 0.034367 |
0 | h1n1_concern | -0.016748 | 0.016748 |
21 | behavioral_outside_home_True | -0.011060 | 0.011060 |
92 | employment_occupation_cmhcxjea | -0.006870 | 0.006870 |
13 | behavioral_avoidance_True | -0.002703 | 0.002703 |
113 rows × 3 columns
We are going to choose the top 20 and bottom 20 predictors by feature importance directionality / magnitude.¶
This will help to reduce the number of predictors used from the initial (over 100 features) to the chosen (40 features). our model will therefore run much slower than if all predictors would have been used in the final model.
chosen_predictors = coeff_df[:40].reset_index(drop=True)
chosen_predictors
ColName | Coeff | Magnitude | |
---|---|---|---|
0 | doctor_recc_h1n1_True | 1.548399 | 1.548399 |
1 | employment_industry_haxffmxo | 1.410665 | 1.410665 |
2 | employment_occupation_dcjcmpih | 1.410665 | 1.410665 |
3 | behavioral_antiviral_meds_No response | 0.882929 | 0.882929 |
4 | health_insurance_No response | -0.860136 | 0.860136 |
5 | employment_occupation_qxajmpny | -0.847009 | 0.847009 |
6 | health_worker_True | 0.787104 | 0.787104 |
7 | health_insurance_True | 0.785535 | 0.785535 |
8 | employment_occupation_tfqavkke | -0.725048 | 0.725048 |
9 | employment_occupation_uqqtjvyb | -0.713886 | 0.713886 |
10 | age_group_65+ Years | 0.656117 | 0.656117 |
11 | employment_industry_phxvnwax | 0.584773 | 0.584773 |
12 | employment_industry_arjwrbjb | 0.570566 | 0.570566 |
13 | employment_industry_xicduogh | 0.551852 | 0.551852 |
14 | employment_occupation_xgwztkwe | -0.538446 | 0.538446 |
15 | age_group_55 - 64 Years | 0.536462 | 0.536462 |
16 | employment_industry_fcxhlnwr | 0.535500 | 0.535500 |
17 | race_Other or Multiple | 0.533174 | 0.533174 |
18 | employment_status_No response | 0.518320 | 0.518320 |
19 | education_No response | 0.513174 | 0.513174 |
20 | child_under_6_months_No response | 0.492454 | 0.492454 |
21 | race_White | 0.492303 | 0.492303 |
22 | opinion_h1n1_vacc_effective | 0.490032 | 0.490032 |
23 | behavioral_face_mask_No response | 0.489136 | 0.489136 |
24 | opinion_h1n1_risk | 0.470053 | 0.470053 |
25 | rent_or_own_Rent | -0.469763 | 0.469763 |
26 | behavioral_outside_home_No response | -0.448110 | 0.448110 |
27 | employment_industry_msuufmds | 0.426944 | 0.426944 |
28 | employment_industry_nduyfdeo | 0.391660 | 0.391660 |
29 | employment_occupation_oijqvulv | -0.386089 | 0.386089 |
30 | employment_industry_wxleyezf | 0.379052 | 0.379052 |
31 | employment_occupation_pvmttkik | -0.377823 | 0.377823 |
32 | race_Hispanic | 0.353311 | 0.353311 |
33 | employment_industry_saaquncn | 0.343572 | 0.343572 |
34 | rent_or_own_Own | -0.338613 | 0.338613 |
35 | opinion_seas_risk | 0.334938 | 0.334938 |
36 | employment_occupation_mxkfnird | -0.318211 | 0.318211 |
37 | employment_industry_vjjrobsf | -0.303148 | 0.303148 |
38 | hhs_geo_region_dqpwygqj | -0.284656 | 0.284656 |
39 | employment_occupation_rcertsgn | -0.283871 | 0.283871 |
positive_color = 'skyblue'
negative_color = 'salmon'
colors = [positive_color if coeff > 0 else negative_color for coeff in chosen_predictors['Coeff'].tolist()]
plt.figure(figsize=(13, 13))
bars = plt.barh(chosen_predictors['ColName'].tolist(), chosen_predictors['Coeff'].tolist(), color=colors)
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Coefficients of ML Model')
for bar, coeff in zip(bars, chosen_predictors['Coeff'].tolist()):
plt.text(bar.get_width() * 1.05, bar.get_y() + bar.get_height() / 2, f'{coeff:.2f}', ha='left', va='center')
plt.gca().invert_yaxis()
plt.show()
Final Model¶
X_df_new = X_df[chosen_predictors['ColName'].tolist()]
x = np.asarray(X_df_new.copy())
y = np.asarray(Y)
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2 ,random_state = randomstate)
models = {
"Logit CV No Penalty": LogisticRegressionCV(random_state=randomstate, cv=cv, max_iter=5000),
"Logit CV L1": LogisticRegressionCV(penalty='l1', random_state=randomstate, cv=cv, solver='liblinear'),
"Logit CV L2": LogisticRegressionCV(penalty='l2', random_state=randomstate, cv=cv, solver='liblinear'),
"Logit CV ElasticNet": LogisticRegressionCV(penalty='elasticnet', random_state=randomstate, cv=cv,
solver='saga', l1_ratios=np.linspace(0.1,0.9,5), max_iter=5000),
"Logit No Penalty": LogisticRegression(random_state=randomstate, max_iter=5000),
"Logit L1": LogisticRegression(penalty='l1', solver='liblinear', random_state=randomstate),
"Logit L2": LogisticRegression(penalty='l2', solver='liblinear', random_state=randomstate),
# "Logit ElasticNet": LogisticRegression(penalty='elasticnet', solver='saga', random_state=randomstate,
# l1_ratios=np.linspace(0.1,0.9,5), max_iter=5000),
"SGD Classifier No Penalty": SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate),
"SGD Classifier L1 Penalty": SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l1'),
"SGD Classifier L2 Penalty": SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l2'),
"SGD Classifier ElasticNet Penalty": SGDClassifier(
loss='log_loss', alpha=0.0001, max_iter=1000, random_state=randomstate, penalty='elasticnet'),
"DecisionTree Classifier Entropy": DecisionTreeClassifier(criterion='entropy',min_samples_leaf=10, random_state=randomstate),
"DecisionTree Classifier Gini": DecisionTreeClassifier(criterion='gini',min_samples_leaf=10, random_state=randomstate),
"SVM RBF": svm.SVC(kernel='rbf', C=20, random_state=randomstate),
"SVM Poly": svm.SVC(kernel='poly', C=20, random_state=randomstate),
}
df_predictions = pd.DataFrame(columns=['Y','Yhat','Model'])
for each, model in models.items():
print(f"Running {str(each)} model currently....")
model.fit(X_train, y_train)
predictions = model.predict(X_test)
model_predictions = pd.DataFrame()
model_predictions['Y'] = y_test.tolist()
model_predictions['Yhat'] = predictions.tolist()
model_predictions['Model'] = str(each)
df_predictions = pd.concat([df_predictions,model_predictions])
# print(df_predictions)
Running Logit CV No Penalty model currently.... Running Logit CV L1 model currently.... Running Logit CV L2 model currently.... Running Logit CV ElasticNet model currently.... Running Logit No Penalty model currently.... Running Logit L1 model currently.... Running Logit L2 model currently.... Running SGD Classifier No Penalty model currently.... Running SGD Classifier L1 Penalty model currently.... Running SGD Classifier L2 Penalty model currently.... Running SGD Classifier ElasticNet Penalty model currently.... Running DecisionTree Classifier Entropy model currently.... Running DecisionTree Classifier Gini model currently.... Running SVM RBF model currently.... Running SVM Poly model currently....
df_predictions['correct_pred'] = np.where(
df_predictions['Y']==df_predictions['Yhat'], 'Correct Prediction', 'Incorrect Prediction'
)
df_predictions
Y | Yhat | Model | correct_pred | |
---|---|---|---|---|
0 | 0 | 0 | Logit CV No Penalty | Correct Prediction |
1 | 0 | 0 | Logit CV No Penalty | Correct Prediction |
2 | 0 | 0 | Logit CV No Penalty | Correct Prediction |
3 | 0 | 0 | Logit CV No Penalty | Correct Prediction |
4 | 0 | 0 | Logit CV No Penalty | Correct Prediction |
... | ... | ... | ... | ... |
5337 | 1 | 0 | SVM Poly | Incorrect Prediction |
5338 | 0 | 0 | SVM Poly | Correct Prediction |
5339 | 0 | 1 | SVM Poly | Incorrect Prediction |
5340 | 0 | 0 | SVM Poly | Correct Prediction |
5341 | 0 | 0 | SVM Poly | Correct Prediction |
80130 rows × 4 columns
correct_predictions = df_predictions.groupby(['Model','correct_pred']).size().reset_index(name='counts').sort_values(
by=['counts','Model'])
correct_predictions
Model | correct_pred | counts | |
---|---|---|---|
7 | Logit CV L1 | Incorrect Prediction | 691 |
9 | Logit CV L2 | Incorrect Prediction | 691 |
11 | Logit CV No Penalty | Incorrect Prediction | 692 |
17 | Logit No Penalty | Incorrect Prediction | 692 |
15 | Logit L2 | Incorrect Prediction | 693 |
13 | Logit L1 | Incorrect Prediction | 694 |
23 | SGD Classifier L2 Penalty | Incorrect Prediction | 695 |
25 | SGD Classifier No Penalty | Incorrect Prediction | 695 |
5 | Logit CV ElasticNet | Incorrect Prediction | 698 |
27 | SVM Poly | Incorrect Prediction | 706 |
21 | SGD Classifier L1 Penalty | Incorrect Prediction | 707 |
1 | DecisionTree Classifier Entropy | Incorrect Prediction | 719 |
3 | DecisionTree Classifier Gini | Incorrect Prediction | 720 |
29 | SVM RBF | Incorrect Prediction | 724 |
19 | SGD Classifier ElasticNet Penalty | Incorrect Prediction | 734 |
18 | SGD Classifier ElasticNet Penalty | Correct Prediction | 4608 |
28 | SVM RBF | Correct Prediction | 4618 |
2 | DecisionTree Classifier Gini | Correct Prediction | 4622 |
0 | DecisionTree Classifier Entropy | Correct Prediction | 4623 |
20 | SGD Classifier L1 Penalty | Correct Prediction | 4635 |
26 | SVM Poly | Correct Prediction | 4636 |
4 | Logit CV ElasticNet | Correct Prediction | 4644 |
22 | SGD Classifier L2 Penalty | Correct Prediction | 4647 |
24 | SGD Classifier No Penalty | Correct Prediction | 4647 |
12 | Logit L1 | Correct Prediction | 4648 |
14 | Logit L2 | Correct Prediction | 4649 |
10 | Logit CV No Penalty | Correct Prediction | 4650 |
16 | Logit No Penalty | Correct Prediction | 4650 |
6 | Logit CV L1 | Correct Prediction | 4651 |
8 | Logit CV L2 | Correct Prediction | 4651 |
grouped_df = correct_predictions.groupby(['correct_pred', 'Model'])['counts'].sum().unstack().reset_index()
for col in grouped_df.columns[1:]:
grouped_df[col] = grouped_df[col] / grouped_df[col].sum()
# Reshape the DataFrame for sorting
stacked_df = grouped_df.set_index('correct_pred').stack().reset_index(name='percentage')
# Sort by percentage values
stacked_df = stacked_df.sort_values(by='percentage', ascending=False)
# Reconstruct the DataFrame to have 'correct_pred' as columns
grouped_df_sorted = stacked_df.pivot_table(index='correct_pred', columns='Model', values='percentage').reset_index()
fig, ax = plt.subplots(figsize=(10, 8))
category_values = grouped_df_sorted.columns[1:]
colors = {'Incorrect Prediction': '#FF6347', 'Correct Prediction': '#32CD32'}
bar_positions = [i for i in range(len(category_values))]
for i, cat1 in enumerate(grouped_df_sorted['correct_pred'].unique()):
values = grouped_df_sorted[grouped_df_sorted['correct_pred'] == cat1][category_values].values[0]
if i == 0:
left = None
else:
left = grouped_df_sorted[grouped_df_sorted['correct_pred'].isin(grouped_df_sorted['correct_pred'].unique()[:i])][category_values].sum().tolist()
bars = ax.barh(bar_positions, values, left=left, label=cat1, color=colors[cat1])
for j, (bar, value) in enumerate(zip(bars, values)):
ax.text(bar.get_width() / 2 + (left[j] if left else 0), bar.get_y() + bar.get_height() / 2,
f'{value*100:.2f}%', ha='center', va='center', color='white')
ax.set_yticks(bar_positions)
ax.set_yticklabels(category_values)
ax.set_xlabel('% of Predictions')
ax.set_title('Correct VS Incorrect Predictions by model', pad=20)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)
plt.tight_layout()
plt.show()
df_metrics = pd.DataFrame()
for each in df_predictions['Model'].unique().tolist():
model_name = each
df_model = df_predictions[df_predictions['Model']==each]
precision, recall, f1_score, _ = precision_recall_fscore_support(df_model['Y'].tolist(),
df_model['Yhat'].tolist())
metrics_df = pd.DataFrame({
'Precision': precision,
'Recall': recall,
'F1-score': f1_score
})
metrics_df.index.name = 'Category'
metrics_df.reset_index(inplace=True)
metrics_df['Model'] = each
df_metrics = pd.concat([df_metrics,metrics_df])
df_metrics['Category'] = np.where(
df_metrics['Category'] == 1, "Vaccinated", "Not Vaccinated"
)
df_metrics
Category | Precision | Recall | F1-score | Model | |
---|---|---|---|---|---|
0 | Not Vaccinated | 0.891222 | 0.960616 | 0.924619 | Logit CV No Penalty |
1 | Vaccinated | 0.700000 | 0.439394 | 0.539894 | Logit CV No Penalty |
0 | Not Vaccinated | 0.891245 | 0.960842 | 0.924736 | Logit CV L1 |
1 | Vaccinated | 0.701209 | 0.439394 | 0.540253 | Logit CV L1 |
0 | Not Vaccinated | 0.891245 | 0.960842 | 0.924736 | Logit CV L2 |
1 | Vaccinated | 0.701209 | 0.439394 | 0.540253 | Logit CV L2 |
0 | Not Vaccinated | 0.888796 | 0.962426 | 0.924147 | Logit CV ElasticNet |
1 | Vaccinated | 0.702509 | 0.424242 | 0.529015 | Logit CV ElasticNet |
0 | Not Vaccinated | 0.891222 | 0.960616 | 0.924619 | Logit No Penalty |
1 | Vaccinated | 0.700000 | 0.439394 | 0.539894 | Logit No Penalty |
0 | Not Vaccinated | 0.891176 | 0.960163 | 0.924384 | Logit L1 |
1 | Vaccinated | 0.697595 | 0.439394 | 0.539177 | Logit L1 |
0 | Not Vaccinated | 0.891199 | 0.960389 | 0.924502 | Logit L2 |
1 | Vaccinated | 0.698795 | 0.439394 | 0.539535 | Logit L2 |
0 | Not Vaccinated | 0.886123 | 0.966953 | 0.924775 | SGD Classifier No Penalty |
1 | Vaccinated | 0.719770 | 0.405844 | 0.519031 | SGD Classifier No Penalty |
0 | Not Vaccinated | 0.885038 | 0.965369 | 0.923460 | SGD Classifier L1 Penalty |
1 | Vaccinated | 0.707457 | 0.400433 | 0.511403 | SGD Classifier L1 Penalty |
0 | Not Vaccinated | 0.886123 | 0.966953 | 0.924775 | SGD Classifier L2 Penalty |
1 | Vaccinated | 0.719770 | 0.405844 | 0.519031 | SGD Classifier L2 Penalty |
0 | Not Vaccinated | 0.909698 | 0.925758 | 0.917658 | SGD Classifier ElasticNet Penalty |
1 | Vaccinated | 0.612293 | 0.560606 | 0.585311 | SGD Classifier ElasticNet Penalty |
0 | Not Vaccinated | 0.891926 | 0.952694 | 0.921309 | DecisionTree Classifier Entropy |
1 | Vaccinated | 0.664526 | 0.448052 | 0.535229 | DecisionTree Classifier Entropy |
0 | Not Vaccinated | 0.891074 | 0.953599 | 0.921277 | DecisionTree Classifier Gini |
1 | Vaccinated | 0.666124 | 0.442641 | 0.531860 | DecisionTree Classifier Gini |
0 | Not Vaccinated | 0.889662 | 0.954504 | 0.920943 | SVM RBF |
1 | Vaccinated | 0.666113 | 0.433983 | 0.525557 | SVM RBF |
0 | Not Vaccinated | 0.892887 | 0.954731 | 0.922774 | SVM Poly |
1 | Vaccinated | 0.676375 | 0.452381 | 0.542153 | SVM Poly |
droput_metrics = df_metrics[df_metrics['Category']=='Vaccinated'].reset_index(drop=True)
droput_metrics.rename(columns={'Precision':'Vaccinated_Precision',
'Recall':'Vaccinated_Recall',
'F1-score':'Vaccinated_F1'},inplace=True)
droput_metrics = droput_metrics.drop(columns='Category')
droput_metrics
Vaccinated_Precision | Vaccinated_Recall | Vaccinated_F1 | Model | |
---|---|---|---|---|
0 | 0.700000 | 0.439394 | 0.539894 | Logit CV No Penalty |
1 | 0.701209 | 0.439394 | 0.540253 | Logit CV L1 |
2 | 0.701209 | 0.439394 | 0.540253 | Logit CV L2 |
3 | 0.702509 | 0.424242 | 0.529015 | Logit CV ElasticNet |
4 | 0.700000 | 0.439394 | 0.539894 | Logit No Penalty |
5 | 0.697595 | 0.439394 | 0.539177 | Logit L1 |
6 | 0.698795 | 0.439394 | 0.539535 | Logit L2 |
7 | 0.719770 | 0.405844 | 0.519031 | SGD Classifier No Penalty |
8 | 0.707457 | 0.400433 | 0.511403 | SGD Classifier L1 Penalty |
9 | 0.719770 | 0.405844 | 0.519031 | SGD Classifier L2 Penalty |
10 | 0.612293 | 0.560606 | 0.585311 | SGD Classifier ElasticNet Penalty |
11 | 0.664526 | 0.448052 | 0.535229 | DecisionTree Classifier Entropy |
12 | 0.666124 | 0.442641 | 0.531860 | DecisionTree Classifier Gini |
13 | 0.666113 | 0.433983 | 0.525557 | SVM RBF |
14 | 0.676375 | 0.452381 | 0.542153 | SVM Poly |
nodroput_metrics = df_metrics[df_metrics['Category']=='Not Vaccinated'].reset_index(drop=True)
nodroput_metrics.rename(columns={'Precision':'Not_Vaccinated_Precision',
'Recall':'Not_Vaccinated_Recall',
'F1-score':'Not_Vaccinated_F1'},inplace=True)
nodroput_metrics = nodroput_metrics.drop(columns='Category')
nodroput_metrics
Not_Vaccinated_Precision | Not_Vaccinated_Recall | Not_Vaccinated_F1 | Model | |
---|---|---|---|---|
0 | 0.891222 | 0.960616 | 0.924619 | Logit CV No Penalty |
1 | 0.891245 | 0.960842 | 0.924736 | Logit CV L1 |
2 | 0.891245 | 0.960842 | 0.924736 | Logit CV L2 |
3 | 0.888796 | 0.962426 | 0.924147 | Logit CV ElasticNet |
4 | 0.891222 | 0.960616 | 0.924619 | Logit No Penalty |
5 | 0.891176 | 0.960163 | 0.924384 | Logit L1 |
6 | 0.891199 | 0.960389 | 0.924502 | Logit L2 |
7 | 0.886123 | 0.966953 | 0.924775 | SGD Classifier No Penalty |
8 | 0.885038 | 0.965369 | 0.923460 | SGD Classifier L1 Penalty |
9 | 0.886123 | 0.966953 | 0.924775 | SGD Classifier L2 Penalty |
10 | 0.909698 | 0.925758 | 0.917658 | SGD Classifier ElasticNet Penalty |
11 | 0.891926 | 0.952694 | 0.921309 | DecisionTree Classifier Entropy |
12 | 0.891074 | 0.953599 | 0.921277 | DecisionTree Classifier Gini |
13 | 0.889662 | 0.954504 | 0.920943 | SVM RBF |
14 | 0.892887 | 0.954731 | 0.922774 | SVM Poly |
metrics_df = pd.merge(nodroput_metrics, droput_metrics, on=['Model'])
metrics_df
Not_Vaccinated_Precision | Not_Vaccinated_Recall | Not_Vaccinated_F1 | Model | Vaccinated_Precision | Vaccinated_Recall | Vaccinated_F1 | |
---|---|---|---|---|---|---|---|
0 | 0.891222 | 0.960616 | 0.924619 | Logit CV No Penalty | 0.700000 | 0.439394 | 0.539894 |
1 | 0.891245 | 0.960842 | 0.924736 | Logit CV L1 | 0.701209 | 0.439394 | 0.540253 |
2 | 0.891245 | 0.960842 | 0.924736 | Logit CV L2 | 0.701209 | 0.439394 | 0.540253 |
3 | 0.888796 | 0.962426 | 0.924147 | Logit CV ElasticNet | 0.702509 | 0.424242 | 0.529015 |
4 | 0.891222 | 0.960616 | 0.924619 | Logit No Penalty | 0.700000 | 0.439394 | 0.539894 |
5 | 0.891176 | 0.960163 | 0.924384 | Logit L1 | 0.697595 | 0.439394 | 0.539177 |
6 | 0.891199 | 0.960389 | 0.924502 | Logit L2 | 0.698795 | 0.439394 | 0.539535 |
7 | 0.886123 | 0.966953 | 0.924775 | SGD Classifier No Penalty | 0.719770 | 0.405844 | 0.519031 |
8 | 0.885038 | 0.965369 | 0.923460 | SGD Classifier L1 Penalty | 0.707457 | 0.400433 | 0.511403 |
9 | 0.886123 | 0.966953 | 0.924775 | SGD Classifier L2 Penalty | 0.719770 | 0.405844 | 0.519031 |
10 | 0.909698 | 0.925758 | 0.917658 | SGD Classifier ElasticNet Penalty | 0.612293 | 0.560606 | 0.585311 |
11 | 0.891926 | 0.952694 | 0.921309 | DecisionTree Classifier Entropy | 0.664526 | 0.448052 | 0.535229 |
12 | 0.891074 | 0.953599 | 0.921277 | DecisionTree Classifier Gini | 0.666124 | 0.442641 | 0.531860 |
13 | 0.889662 | 0.954504 | 0.920943 | SVM RBF | 0.666113 | 0.433983 | 0.525557 |
14 | 0.892887 | 0.954731 | 0.922774 | SVM Poly | 0.676375 | 0.452381 | 0.542153 |
models = metrics_df['Model'].tolist()
precision_dropout = metrics_df['Vaccinated_Precision'].tolist()
recall_dropout = metrics_df['Vaccinated_Recall'].tolist()
f_score_dropout = metrics_df['Vaccinated_F1'].tolist()
precision_not_dropout = metrics_df['Not_Vaccinated_Precision'].tolist()
recall_not_dropout = metrics_df['Not_Vaccinated_Recall'].tolist()
f_score_not_dropout = metrics_df['Not_Vaccinated_F1'].tolist()
# Create DataFrames for the data
data_dropout = {
'Model': models,
'Precision': precision_dropout,
'Recall': recall_dropout,
'F-score': f_score_dropout,
'Category': 'Vaccinated'
}
data_not_dropout = {
'Model': models,
'Precision': precision_not_dropout,
'Recall': recall_not_dropout,
'F-score': f_score_not_dropout,
'Category': 'Not Vaccinated'
}
df_dropout = pd.DataFrame(data_dropout)
df_not_dropout = pd.DataFrame(data_not_dropout)
# Concatenate the two DataFrames
df = pd.concat([df_dropout, df_not_dropout])
# Sort the DataFrame based on F-score values
df_sorted = df.sort_values(by='F-score', ascending=True)
# Melt the DataFrame for plotting
df_melted = df_sorted.melt(id_vars=['Model', 'Category'], var_name='Metric', value_name='Score')
# df_melted
# Plotting
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 8), sharey=True)
for i, ax in enumerate(axes):
if i == 0:
title = 'Vaccinated'
else:
title = 'Not Vaccinated'
sns.pointplot(data=df_melted[df_melted['Category'] == title], x='Score', y='Model', hue='Metric', dodge=True, join=False, markers=['o', 's', '^'], errorbar=None, ax=ax, linestyles='')
ax.errorbar(x=df_melted[df_melted['Category'] == title]['Score'], y=df_melted[df_melted['Category'] == title]['Model'], xerr=np.std(df_melted[df_melted['Category'] == title]['Score']), fmt='none', color='black', zorder=-1)
ax.set_xlabel('Score')
ax.set_ylabel('')
ax.set_title(title)
ax.grid(True)
ax.get_legend().remove() # Remove legend from subplots
ax.xaxis.grid(False) # Remove horizontal lines connecting the points
# Consolidated legend
handles, labels = axes[1].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=3, title='Metric')
plt.suptitle('Performance Metrics by Model and Vaccination', y=1.05)
plt.tight_layout()
plt.show()
Due to the rigidity of the SGDClassifier with the L2 penalty in predicting Vaccinated respondents (Precision, Recall and F1-score) being close together, this is the model that I am going to choose.
SGDClassifier Model¶
X_train2, X_test2, y_train2, y_test2 = train_test_split(X,Y,test_size = 0.3 ,random_state = randomstate)
mdl = SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l2')
mdl.fit(X_train2, y_train2)
BZ = mdl.coef_
# predict data and analyze result
Yh = mdl.predict(X)
confusion_matrix = pd.crosstab(Y, Yh,rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(confusion_matrix, annot=True)
plt.show()
Stats = classification_report(Y, Yh)
print(Stats)
precision recall f1-score support 0 0.89 0.97 0.92 22010 1 0.72 0.43 0.54 4697 accuracy 0.87 26707 macro avg 0.81 0.70 0.73 26707 weighted avg 0.86 0.87 0.86 26707
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
numeric_preprocessing_steps = Pipeline([
('standard_scaler', StandardScaler()),
('simple_imputer', SimpleImputer(strategy='median'))
])
# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
preprocessor = ColumnTransformer(
transformers = [
("numeric", numeric_preprocessing_steps, numerical_cols)
],
remainder = "drop"
)
estimators = MultiOutputClassifier(
estimator=SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l2')
)
full_pipeline = Pipeline([
("preprocessor", preprocessor),
("estimators", estimators),
])
X_train, X_eval, y_train, y_eval = train_test_split(
features_df[features_df['feat']=='Train'].drop(columns=['feat']),
labels_df.drop(columns=['VaccineCategory','Y','One_vaccine_only']),
test_size=0.2,
shuffle=True,
# stratify=labels_df.drop(columns=['VaccineCategory','Y','One_vaccine_only']),
random_state=randomstate
)
%%time
# Train model
full_pipeline.fit(X_train, y_train)
# Predict on evaluation set
# This competition wants probabilities, not labels
preds = full_pipeline.predict_proba(X_eval)
preds
print("test_probas[0].shape", preds[0].shape)
print("test_probas[1].shape", preds[1].shape)
y_preds = pd.DataFrame(
{
"h1n1_vaccine": preds[0][:, 1],
"seasonal_vaccine": preds[1][:, 1],
},
index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()
def plot_roc(y_true, y_score, label_name, ax):
fpr, tpr, thresholds = roc_curve(y_true, y_score)
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax.set_ylabel('TPR')
ax.set_xlabel('FPR')
ax.set_title(
f"{label_name}: AUC = {roc_auc_score(y_true, y_score):.4f}"
)
fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))
plot_roc(
y_eval['h1n1_vaccine'],
y_preds['h1n1_vaccine'],
'h1n1_vaccine',
ax=ax[0]
)
plot_roc(
y_eval['seasonal_vaccine'],
y_preds['seasonal_vaccine'],
'seasonal_vaccine',
ax=ax[1]
)
fig.tight_layout()
# roc_auc_score(y_eval, y_preds)