import pandas as pd
import numpy as np
import io
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score, classification_report, precision_recall_fscore_support
from sklearn import linear_model
from sklearn.linear_model import LogisticRegressionCV, Lasso, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

pd.set_option("display.max_columns", 100)

randomstate = 42
cv = 10
n_features = 5

def info_to_dataframe(df):
    buffer = io.StringIO()
    df.info(buf=buffer)
    info_string = buffer.getvalue()
    info_lines = info_string.splitlines()
    data = []
    for line in info_lines[5:-2]:
        split_line = line.split(maxsplit=4)
        if len(split_line) == 5:
            non_null_count = split_line[2].replace(',', '').strip()
            dtype = split_line[4]
            null_count = len(df) - int(non_null_count)
        else:
            non_null_count = None
            dtype = None
        row_data = {
            'Column': split_line[1],
            'Non-Null Count': non_null_count,
            'Null Count': null_count,
            'Dtype': dtype
        }
        data.append(row_data)
    info_df = pd.DataFrame(data)
    return info_df.sort_values(by='Null Count', ascending=False).reset_index(drop=True)

def plot_correlation_matrix(df, column1, column2):
    mydf = df[[column1, column2]]
    correlation_matrix = mydf.corr()
    plt.figure(figsize=(6, 4))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, square=True, cbar_kws={"shrink": .8})
    plt.title(f'Correlation Matrix between {column1} and {column2}', fontsize=14)
    plt.xticks(ticks=[0.5, 1.5], labels=[column1, column2])
    plt.yticks(ticks=[0.5, 1.5], labels=[column1, column2])
    plt.show()

def plot_pie_chart(sizes, labels, colors, title):
    plt.figure(figsize=(14, 8))
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')
    plt.legend(loc="lower right",
               labels=[f'{category}: N={value}' for category, value in zip(
        labels_vaccine_categ, sizes_vaccine_categ)])
    plt.title(title)
    plt.show()

train_features_df = pd.read_csv('training_set_features.csv')
train_features_df['feat'] = 'Train'
train_features_df.head()

test_features_df = pd.read_csv('test_set_features.csv')
test_features_df['feat'] = 'Test'
test_features_df.head()

features_df = pd.concat([train_features_df, test_features_df])
features_df#.head()

features_df.columns

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation', 'feat'],
      dtype='object')

features_df.describe()

features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53415 entries, 0 to 26707
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                53415 non-null  int64  
 1   h1n1_concern                 53238 non-null  float64
 2   h1n1_knowledge               53177 non-null  float64
 3   behavioral_antiviral_meds    53265 non-null  float64
 4   behavioral_avoidance         52994 non-null  float64
 5   behavioral_face_mask         53377 non-null  float64
 6   behavioral_wash_hands        53333 non-null  float64
 7   behavioral_large_gatherings  53256 non-null  float64
 8   behavioral_outside_home      53251 non-null  float64
 9   behavioral_touch_face        53159 non-null  float64
 10  doctor_recc_h1n1             49095 non-null  float64
 11  doctor_recc_seasonal         49095 non-null  float64
 12  chronic_med_condition        51512 non-null  float64
 13  child_under_6_months         51782 non-null  float64
 14  health_worker                51822 non-null  float64
 15  health_insurance             28913 non-null  float64
 16  opinion_h1n1_vacc_effective  52626 non-null  float64
 17  opinion_h1n1_risk            52647 non-null  float64
 18  opinion_h1n1_sick_from_vacc  52645 non-null  float64
 19  opinion_seas_vacc_effective  52501 non-null  float64
 20  opinion_seas_risk            52402 non-null  float64
 21  opinion_seas_sick_from_vacc  52357 non-null  float64
 22  age_group                    53415 non-null  object 
 23  education                    50601 non-null  object 
 24  race                         53415 non-null  object 
 25  sex                          53415 non-null  object 
 26  income_poverty               44495 non-null  object 
 27  marital_status               50565 non-null  object 
 28  rent_or_own                  49337 non-null  object 
 29  employment_status            50481 non-null  object 
 30  hhs_geo_region               53415 non-null  object 
 31  census_msa                   53415 non-null  object 
 32  household_adults             52941 non-null  float64
 33  household_children           52941 non-null  float64
 34  employment_industry          26810 non-null  object 
 35  employment_occupation        26519 non-null  object 
 36  feat                         53415 non-null  object 
dtypes: float64(23), int64(1), object(13)
memory usage: 15.5+ MB

labels_df = pd.read_csv('training_set_labels.csv')
labels_df[['h1n1_vaccine','seasonal_vaccine']] = labels_df[
    ['h1n1_vaccine','seasonal_vaccine']].astype('category')
labels_df.head()

labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   respondent_id     26707 non-null  int64   
 1   h1n1_vaccine      26707 non-null  category
 2   seasonal_vaccine  26707 non-null  category
dtypes: category(2), int64(1)
memory usage: 261.2 KB

labels_df['VaccineCategory'] = np.where(
    ((labels_df['h1n1_vaccine']==1)&(labels_df['seasonal_vaccine']==1)), "Both Vaccines",
    np.where(
        ((labels_df['h1n1_vaccine']==1)&(labels_df['seasonal_vaccine']==0)), "Seasonal Only",
        np.where(
            ((labels_df['h1n1_vaccine']==0)&(labels_df['seasonal_vaccine']==1)), "H1N1 Only",
            np.where(
                ((labels_df['h1n1_vaccine']==0)&(labels_df['seasonal_vaccine']==0)), "No Vaccine",
                "Unknown"
            )
        )
    )
)
df_vaccine_categ = labels_df.groupby('VaccineCategory')['respondent_id'].nunique().reset_index(
name='N').sort_values(by="N").reset_index(drop=True)
labels_vaccine_categ = df_vaccine_categ['VaccineCategory'].unique().tolist()
sizes_vaccine_categ = df_vaccine_categ['N'].tolist()
colors_vaccine_categ = ['#ADD8E6','#e57373','#81c784','#FFFF99']
plot_pie_chart(
    sizes_vaccine_categ, 
    labels=labels_vaccine_categ, 
    colors=colors_vaccine_categ,
    title = 'Respondent Distro by VaccineCategory'
)

h1n1_counts = labels_df.groupby('h1n1_vaccine')['respondent_id'].nunique()
seasonal_counts = labels_df.groupby('seasonal_vaccine')['respondent_id'].nunique()
h1n1_percent = (h1n1_counts / h1n1_counts.sum()) * 100
seasonal_percent = (seasonal_counts / seasonal_counts.sum()) * 100
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
h1n1_counts.plot(kind='barh', ax=axes[0], color='skyblue')
for idx, value in enumerate(h1n1_counts):
    axes[0].text(value - 20, idx, f'{value} ({h1n1_percent[idx]:.0f}%)', va='center', ha='right', color='white')
axes[0].set_title('H1N1 Vaccine Counts')
axes[0].set_xlabel('')
axes[0].set_ylabel('')
axes[0].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
seasonal_counts.plot(kind='barh', ax=axes[1], color='salmon')
for idx, value in enumerate(seasonal_counts):
    axes[1].text(value - 20, idx, f'{value} ({seasonal_percent[idx]:.0f}%)', va='center', ha='right', color='white')
axes[1].set_title('Seasonal Vaccine Counts')
axes[1].set_xlabel('')
axes[1].set_ylabel('')
axes[1].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tight_layout()
plt.show()

# Create a contingency table
contingency_table = pd.crosstab(labels_df['h1n1_vaccine'], labels_df['seasonal_vaccine'])

# Perform Chi-Square test of independence
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Print the results
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies Table:")
print(expected)

# Interpretation
alpha = 0.05
if p < alpha:
    print("Reject the null hypothesis. The two variables are dependent.")
else:
    print("Fail to reject the null hypothesis. The two variables are independent.")

Chi-Square Statistic: 3796.8641900765715
P-value: 0.0
Degrees of Freedom: 1
Expected Frequencies Table:
[[11239.86130977  9793.13869023]
 [ 3032.13869023  2641.86130977]]
Reject the null hypothesis. The two variables are dependent.

object_cols = []
for each in features_df.columns.tolist():
    df_each = features_df[each]
    type_col = df_each.dtype
    try:
        max_val = df_each.max()
        min_val = df_each.min()
        if min_val == 0 and max_val == 1:
            type_col = object
            object_cols.append(each)
    except:
        pass
    
    
null_object_cols = []
for each in features_df.columns.tolist():
    df_each = features_df[each]
    type_col = df_each.dtype
    if type_col == object and df_each.isna().any() and each not in object_cols:
        null_object_cols.append(each)

features_df[object_cols] = features_df[object_cols].apply(lambda x: x.map({1: 'True', 0: 'False', np.nan: 'No response'})).copy()
features_df[null_object_cols] = features_df[null_object_cols].fillna("No response").copy()
features_df.head()

int_object_cols = []
int_object_cols_medians = []
for each in features_df.columns.tolist():
    df_each = features_df[each]
    type_col = df_each.dtype
    if type_col == float or type_col == int and each not in ['respondent_id']:
        if df_each.isna().any():
            int_object_cols.append(each)
            median_col = df_each.median()
            int_object_cols_medians.append(median_col)
            
features_df_fin = features_df.copy()
for column, median in zip(int_object_cols, int_object_cols_medians):
    features_df_fin[column].fillna(median, inplace=True)
features_df_fin.head()

features_df_fin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53415 entries, 0 to 26707
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                53415 non-null  int64  
 1   h1n1_concern                 53415 non-null  float64
 2   h1n1_knowledge               53415 non-null  float64
 3   behavioral_antiviral_meds    53415 non-null  object 
 4   behavioral_avoidance         53415 non-null  object 
 5   behavioral_face_mask         53415 non-null  object 
 6   behavioral_wash_hands        53415 non-null  object 
 7   behavioral_large_gatherings  53415 non-null  object 
 8   behavioral_outside_home      53415 non-null  object 
 9   behavioral_touch_face        53415 non-null  object 
 10  doctor_recc_h1n1             53415 non-null  object 
 11  doctor_recc_seasonal         53415 non-null  object 
 12  chronic_med_condition        53415 non-null  object 
 13  child_under_6_months         53415 non-null  object 
 14  health_worker                53415 non-null  object 
 15  health_insurance             53415 non-null  object 
 16  opinion_h1n1_vacc_effective  53415 non-null  float64
 17  opinion_h1n1_risk            53415 non-null  float64
 18  opinion_h1n1_sick_from_vacc  53415 non-null  float64
 19  opinion_seas_vacc_effective  53415 non-null  float64
 20  opinion_seas_risk            53415 non-null  float64
 21  opinion_seas_sick_from_vacc  53415 non-null  float64
 22  age_group                    53415 non-null  object 
 23  education                    53415 non-null  object 
 24  race                         53415 non-null  object 
 25  sex                          53415 non-null  object 
 26  income_poverty               53415 non-null  object 
 27  marital_status               53415 non-null  object 
 28  rent_or_own                  53415 non-null  object 
 29  employment_status            53415 non-null  object 
 30  hhs_geo_region               53415 non-null  object 
 31  census_msa                   53415 non-null  object 
 32  household_adults             53415 non-null  float64
 33  household_children           53415 non-null  float64
 34  employment_industry          53415 non-null  object 
 35  employment_occupation        53415 non-null  object 
 36  feat                         53415 non-null  object 
dtypes: float64(10), int64(1), object(26)
memory usage: 15.5+ MB

labels_df['Y'] = np.where(
    labels_df['VaccineCategory'] == "Both Vaccines", 1, 0
)
labels_df['One_vaccine_only'] = np.where(
    labels_df['VaccineCategory'].isin(["Seasonal Only","H1N1 Only"]), True, False
)
labels_df.head()

df_vaccine_categ2 = labels_df.groupby('Y')[
    'respondent_id'].nunique().reset_index(name='N').sort_values(by="N").reset_index(drop=True)
df_vaccine_categ2

final_df = pd.merge(
    features_df_fin, 
    labels_df[['respondent_id','Y']], 
    on=['respondent_id'], 
    how='left'
)
final_df['Y'] = final_df['Y'].astype(object)
final_df = final_df.drop(columns=['respondent_id'])
final_df.head()

final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53415 entries, 0 to 53414
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 53415 non-null  float64
 1   h1n1_knowledge               53415 non-null  float64
 2   behavioral_antiviral_meds    53415 non-null  object 
 3   behavioral_avoidance         53415 non-null  object 
 4   behavioral_face_mask         53415 non-null  object 
 5   behavioral_wash_hands        53415 non-null  object 
 6   behavioral_large_gatherings  53415 non-null  object 
 7   behavioral_outside_home      53415 non-null  object 
 8   behavioral_touch_face        53415 non-null  object 
 9   doctor_recc_h1n1             53415 non-null  object 
 10  doctor_recc_seasonal         53415 non-null  object 
 11  chronic_med_condition        53415 non-null  object 
 12  child_under_6_months         53415 non-null  object 
 13  health_worker                53415 non-null  object 
 14  health_insurance             53415 non-null  object 
 15  opinion_h1n1_vacc_effective  53415 non-null  float64
 16  opinion_h1n1_risk            53415 non-null  float64
 17  opinion_h1n1_sick_from_vacc  53415 non-null  float64
 18  opinion_seas_vacc_effective  53415 non-null  float64
 19  opinion_seas_risk            53415 non-null  float64
 20  opinion_seas_sick_from_vacc  53415 non-null  float64
 21  age_group                    53415 non-null  object 
 22  education                    53415 non-null  object 
 23  race                         53415 non-null  object 
 24  sex                          53415 non-null  object 
 25  income_poverty               53415 non-null  object 
 26  marital_status               53415 non-null  object 
 27  rent_or_own                  53415 non-null  object 
 28  employment_status            53415 non-null  object 
 29  hhs_geo_region               53415 non-null  object 
 30  census_msa                   53415 non-null  object 
 31  household_adults             53415 non-null  float64
 32  household_children           53415 non-null  float64
 33  employment_industry          53415 non-null  object 
 34  employment_occupation        53415 non-null  object 
 35  feat                         53415 non-null  object 
 36  Y                            26707 non-null  object 
dtypes: float64(10), object(27)
memory usage: 15.5+ MB

correlation_matrix_ = final_df.corr()

C:\Users\kevin\AppData\Local\Temp\ipykernel_25912\3626133691.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  correlation_matrix_ = final_df.corr()

threshold = 0.70
corr_pairs = correlation_matrix_.abs().unstack().sort_values(kind="quicksort", ascending=False)
filtered_pairs = corr_pairs[(corr_pairs >= threshold) & (corr_pairs < 1)]
filtered_pairs

Series([], dtype: float64)

print("Highly correlated pairs:")
for index, value in filtered_pairs.items():
    print(f"{index[0]} and {index[1]}: {value}")

Highly correlated pairs:

y_col = final_df[['Y']].columns.values.tolist()
feat_col = ['feat']
numerical_cols = final_df.columns[
    ((~final_df.dtypes.isin(['object','category']))&(~final_df.columns.isin(y_col+feat_col)))
].values.tolist()
categorical_cols = final_df.columns[
    ((final_df.dtypes.isin(['object','category']))&(~final_df.columns.isin(y_col+feat_col)))
].values.tolist()

numeric_scaled = pd.DataFrame(preprocessing.scale(np.asarray(final_df[numerical_cols])))
numeric_scaled.columns = numerical_cols
numeric_scaled#.head()

dummied_df = pd.get_dummies(final_df[categorical_cols], drop_first=True)
dummied_df.head()

X_df_ = pd.concat([numeric_scaled,dummied_df, final_df[['feat']]], axis=1)
X_df = X_df_[X_df_['feat']=='Train'].drop(columns='feat')
X_columns = X_df.columns.tolist()
Y_df = final_df[y_col][:len(X_df)].astype(int).astype('category').copy()
# Y_df#.head()

X = X_df.copy()
Y = Y_df.squeeze()

print(X.shape, Y.shape)

(26707, 113) (26707,)

n = n_features
kf = KFold(n_splits=n)
offset = 0.000000001
number_of_steps = 50
maxlambda = 2.5
step = maxlambda / number_of_steps
unique_lambdas = np.arange(0 + offset, maxlambda + step, step)
save_avg_coef = []
save_avg_intercept = []        
for the_lambda in unique_lambdas:   
    sum_coef = [0] * len(X_columns)
    sum_intercept = 0   
    for train_index, test_index in kf.split(X):
        X_train, X_test = X, X
        Y_train, Y_test = Y, Y    
        clf = linear_model.LogisticRegression(C = the_lambda, max_iter=1000)
        clf.fit(X_train, Y_train)
        for i in range(len(clf.coef_[0])):
            sum_coef[i] += clf.coef_[0][i]  
        sum_intercept += clf.intercept_            
    avg_coef = [0] * len(X_columns)    
    for i in range(len(sum_coef)):
        avg_coef[i] = sum_coef[i] / n    
    save_avg_coef.append(avg_coef)    
    avg_intercept = sum_intercept / n    
    save_avg_intercept.append(avg_intercept)

#LASSO PLOT, FULL
plt.figure(figsize=(14, 8))
ax = plt.gca()
ax.plot(unique_lambdas, save_avg_coef)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])
plt.xlabel('lambda')
plt.ylabel('weights')
plt.title('Lasso coefficients ALL')
plt.axis('tight')
plt.show()
pred_index = 38
display("PREDICTOR LAMBDA:", unique_lambdas[pred_index])

'PREDICTOR LAMBDA:'

1.9000000010000002

coeff_df = pd.DataFrame(columns=['ColName','Coeff'])
coeff_df['ColName'] = X_columns
coeff_df['Coeff'] = save_avg_coef[pred_index]
coeff_df['Magnitude'] = np.where(
    coeff_df['Coeff']<0, coeff_df['Coeff']*-1, coeff_df['Coeff']
)
coeff_df = coeff_df.sort_values(by='Magnitude',ascending=False)
coeff_df

chosen_predictors = coeff_df[:40].reset_index(drop=True)
chosen_predictors

positive_color = 'skyblue'
negative_color = 'salmon'
colors = [positive_color if coeff > 0 else negative_color for coeff in chosen_predictors['Coeff'].tolist()]
plt.figure(figsize=(13, 13))
bars = plt.barh(chosen_predictors['ColName'].tolist(), chosen_predictors['Coeff'].tolist(), color=colors)
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Coefficients of ML Model')
for bar, coeff in zip(bars,  chosen_predictors['Coeff'].tolist()):
    plt.text(bar.get_width() * 1.05, bar.get_y() + bar.get_height() / 2, f'{coeff:.2f}', ha='left', va='center')

plt.gca().invert_yaxis()
plt.show()

X_df_new = X_df[chosen_predictors['ColName'].tolist()]
x = np.asarray(X_df_new.copy())

y = np.asarray(Y)

X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2 ,random_state = randomstate)

models = {
"Logit CV No Penalty": LogisticRegressionCV(random_state=randomstate, cv=cv, max_iter=5000),
"Logit CV L1": LogisticRegressionCV(penalty='l1', random_state=randomstate, cv=cv, solver='liblinear'),
"Logit CV L2": LogisticRegressionCV(penalty='l2', random_state=randomstate, cv=cv, solver='liblinear'),
"Logit CV ElasticNet": LogisticRegressionCV(penalty='elasticnet', random_state=randomstate, cv=cv, 
                                            solver='saga', l1_ratios=np.linspace(0.1,0.9,5), max_iter=5000),
"Logit No Penalty": LogisticRegression(random_state=randomstate, max_iter=5000),
"Logit L1": LogisticRegression(penalty='l1', solver='liblinear', random_state=randomstate),
"Logit L2": LogisticRegression(penalty='l2', solver='liblinear', random_state=randomstate),
# "Logit ElasticNet": LogisticRegression(penalty='elasticnet', solver='saga', random_state=randomstate, 
#                                        l1_ratios=np.linspace(0.1,0.9,5), max_iter=5000),
"SGD Classifier No Penalty": SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate),
"SGD Classifier L1 Penalty": SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l1'),
"SGD Classifier L2 Penalty": SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l2'),
"SGD Classifier ElasticNet Penalty": SGDClassifier(
    loss='log_loss', alpha=0.0001, max_iter=1000, random_state=randomstate, penalty='elasticnet'),
"DecisionTree Classifier Entropy": DecisionTreeClassifier(criterion='entropy',min_samples_leaf=10, random_state=randomstate),
"DecisionTree Classifier Gini": DecisionTreeClassifier(criterion='gini',min_samples_leaf=10, random_state=randomstate),
"SVM RBF": svm.SVC(kernel='rbf', C=20, random_state=randomstate),
"SVM Poly": svm.SVC(kernel='poly', C=20, random_state=randomstate),
}
df_predictions = pd.DataFrame(columns=['Y','Yhat','Model'])
for each, model in models.items():
    print(f"Running {str(each)} model currently....")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_predictions = pd.DataFrame()
    model_predictions['Y'] = y_test.tolist()
    model_predictions['Yhat'] = predictions.tolist()
    model_predictions['Model'] = str(each)
    df_predictions = pd.concat([df_predictions,model_predictions])

# print(df_predictions)

Running Logit CV No Penalty model currently....
Running Logit CV L1 model currently....
Running Logit CV L2 model currently....
Running Logit CV ElasticNet model currently....
Running Logit No Penalty model currently....
Running Logit L1 model currently....
Running Logit L2 model currently....
Running SGD Classifier No Penalty model currently....
Running SGD Classifier L1 Penalty model currently....
Running SGD Classifier L2 Penalty model currently....
Running SGD Classifier ElasticNet Penalty model currently....
Running DecisionTree Classifier Entropy model currently....
Running DecisionTree Classifier Gini model currently....
Running SVM RBF model currently....
Running SVM Poly model currently....

df_predictions['correct_pred'] = np.where(
    df_predictions['Y']==df_predictions['Yhat'], 'Correct Prediction', 'Incorrect Prediction'
)
df_predictions

correct_predictions = df_predictions.groupby(['Model','correct_pred']).size().reset_index(name='counts').sort_values(
    by=['counts','Model'])
correct_predictions

grouped_df = correct_predictions.groupby(['correct_pred', 'Model'])['counts'].sum().unstack().reset_index()
for col in grouped_df.columns[1:]:
    grouped_df[col] = grouped_df[col] / grouped_df[col].sum()

# Reshape the DataFrame for sorting
stacked_df = grouped_df.set_index('correct_pred').stack().reset_index(name='percentage')
# Sort by percentage values
stacked_df = stacked_df.sort_values(by='percentage', ascending=False)

# Reconstruct the DataFrame to have 'correct_pred' as columns
grouped_df_sorted = stacked_df.pivot_table(index='correct_pred', columns='Model', values='percentage').reset_index()

fig, ax = plt.subplots(figsize=(10, 8))

category_values = grouped_df_sorted.columns[1:]
colors = {'Incorrect Prediction': '#FF6347', 'Correct Prediction': '#32CD32'}

bar_positions = [i for i in range(len(category_values))]

for i, cat1 in enumerate(grouped_df_sorted['correct_pred'].unique()):
    values = grouped_df_sorted[grouped_df_sorted['correct_pred'] == cat1][category_values].values[0]
    if i == 0:
        left = None
    else:
        left = grouped_df_sorted[grouped_df_sorted['correct_pred'].isin(grouped_df_sorted['correct_pred'].unique()[:i])][category_values].sum().tolist()
    bars = ax.barh(bar_positions, values, left=left, label=cat1, color=colors[cat1])
    for j, (bar, value) in enumerate(zip(bars, values)):
        ax.text(bar.get_width() / 2 + (left[j] if left else 0), bar.get_y() + bar.get_height() / 2,
                f'{value*100:.2f}%', ha='center', va='center', color='white')

ax.set_yticks(bar_positions)
ax.set_yticklabels(category_values)
ax.set_xlabel('% of Predictions')
ax.set_title('Correct VS Incorrect Predictions by model', pad=20)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)
plt.tight_layout()
plt.show()

df_metrics = pd.DataFrame()
for each in df_predictions['Model'].unique().tolist():
    model_name = each
    df_model = df_predictions[df_predictions['Model']==each]
    precision, recall, f1_score, _ = precision_recall_fscore_support(df_model['Y'].tolist(), 
                                                                     df_model['Yhat'].tolist())
    metrics_df = pd.DataFrame({
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1_score
    })
    metrics_df.index.name = 'Category'
    metrics_df.reset_index(inplace=True)
    metrics_df['Model'] = each
    df_metrics = pd.concat([df_metrics,metrics_df])
    
df_metrics['Category'] = np.where(
    df_metrics['Category'] == 1, "Vaccinated", "Not Vaccinated"
)
df_metrics

droput_metrics = df_metrics[df_metrics['Category']=='Vaccinated'].reset_index(drop=True)
droput_metrics.rename(columns={'Precision':'Vaccinated_Precision', 
                               'Recall':'Vaccinated_Recall',
                               'F1-score':'Vaccinated_F1'},inplace=True)
droput_metrics = droput_metrics.drop(columns='Category')
droput_metrics

nodroput_metrics = df_metrics[df_metrics['Category']=='Not Vaccinated'].reset_index(drop=True)
nodroput_metrics.rename(columns={'Precision':'Not_Vaccinated_Precision', 
                               'Recall':'Not_Vaccinated_Recall',
                               'F1-score':'Not_Vaccinated_F1'},inplace=True)
nodroput_metrics = nodroput_metrics.drop(columns='Category')
nodroput_metrics

metrics_df = pd.merge(nodroput_metrics, droput_metrics, on=['Model'])
metrics_df

models = metrics_df['Model'].tolist()
precision_dropout = metrics_df['Vaccinated_Precision'].tolist()
recall_dropout = metrics_df['Vaccinated_Recall'].tolist()
f_score_dropout = metrics_df['Vaccinated_F1'].tolist()

precision_not_dropout = metrics_df['Not_Vaccinated_Precision'].tolist()
recall_not_dropout = metrics_df['Not_Vaccinated_Recall'].tolist()
f_score_not_dropout = metrics_df['Not_Vaccinated_F1'].tolist()

# Create DataFrames for the data
data_dropout = {
    'Model': models,
    'Precision': precision_dropout,
    'Recall': recall_dropout,
    'F-score': f_score_dropout,
    'Category': 'Vaccinated'
}

data_not_dropout = {
    'Model': models,
    'Precision': precision_not_dropout,
    'Recall': recall_not_dropout,
    'F-score': f_score_not_dropout,
    'Category': 'Not Vaccinated'
}

df_dropout = pd.DataFrame(data_dropout)
df_not_dropout = pd.DataFrame(data_not_dropout)

# Concatenate the two DataFrames
df = pd.concat([df_dropout, df_not_dropout])

# Sort the DataFrame based on F-score values
df_sorted = df.sort_values(by='F-score', ascending=True)
# Melt the DataFrame for plotting
df_melted = df_sorted.melt(id_vars=['Model', 'Category'], var_name='Metric', value_name='Score')
# df_melted
# Plotting
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 8), sharey=True)

for i, ax in enumerate(axes):
    if i == 0:
        title = 'Vaccinated'
    else:
        title = 'Not Vaccinated'
    sns.pointplot(data=df_melted[df_melted['Category'] == title], x='Score', y='Model', hue='Metric', dodge=True, join=False, markers=['o', 's', '^'], errorbar=None, ax=ax, linestyles='')
    ax.errorbar(x=df_melted[df_melted['Category'] == title]['Score'], y=df_melted[df_melted['Category'] == title]['Model'], xerr=np.std(df_melted[df_melted['Category'] == title]['Score']), fmt='none', color='black', zorder=-1)
    ax.set_xlabel('Score')
    ax.set_ylabel('')
    ax.set_title(title)
    ax.grid(True)
    ax.get_legend().remove()  # Remove legend from subplots
    ax.xaxis.grid(False)  # Remove horizontal lines connecting the points

# Consolidated legend
handles, labels = axes[1].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=3, title='Metric')

plt.suptitle('Performance Metrics by Model and Vaccination', y=1.05)
plt.tight_layout()
plt.show()

X_train2, X_test2, y_train2, y_test2 = train_test_split(X,Y,test_size = 0.3 ,random_state = randomstate)

mdl = SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l2')
mdl.fit(X_train2, y_train2)
BZ = mdl.coef_

# predict data and analyze result
Yh = mdl.predict(X)

confusion_matrix = pd.crosstab(Y, Yh,rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(confusion_matrix, annot=True)
plt.show()

Stats = classification_report(Y, Yh)
print(Stats)

              precision    recall  f1-score   support

           0       0.89      0.97      0.92     22010
           1       0.72      0.43      0.54      4697

    accuracy                           0.87     26707
   macro avg       0.81      0.70      0.73     26707
weighted avg       0.86      0.87      0.86     26707

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median'))
])

# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numerical_cols)
    ],
    remainder = "drop"
)

estimators = MultiOutputClassifier(
    estimator=SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=randomstate, penalty='l2')
)

full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

X_train, X_eval, y_train, y_eval = train_test_split(
    features_df[features_df['feat']=='Train'].drop(columns=['feat']),
    labels_df.drop(columns=['VaccineCategory','Y','One_vaccine_only']),
    test_size=0.2,
    shuffle=True,
#     stratify=labels_df.drop(columns=['VaccineCategory','Y','One_vaccine_only']),
    random_state=randomstate
)

%%time

# Train model
full_pipeline.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = full_pipeline.predict_proba(X_eval)
preds

print("test_probas[0].shape", preds[0].shape)
print("test_probas[1].shape", preds[1].shape)

y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

def plot_roc(y_true, y_score, label_name, ax):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
    ax.set_ylabel('TPR')
    ax.set_xlabel('FPR')
    ax.set_title(
        f"{label_name}: AUC = {roc_auc_score(y_true, y_score):.4f}"
    )

fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))

plot_roc(
    y_eval['h1n1_vaccine'], 
    y_preds['h1n1_vaccine'], 
    'h1n1_vaccine',
    ax=ax[0]
)
plot_roc(
    y_eval['seasonal_vaccine'], 
    y_preds['seasonal_vaccine'], 
    'seasonal_vaccine',
    ax=ax[1]
)
fig.tight_layout()

# roc_auc_score(y_eval, y_preds)

	respondent_id	h1n1_concern	h1n1_knowledge	behavioral_antiviral_meds	behavioral_avoidance	behavioral_face_mask	behavioral_wash_hands	behavioral_large_gatherings	behavioral_outside_home	behavioral_touch_face	doctor_recc_h1n1	doctor_recc_seasonal	chronic_med_condition	child_under_6_months	health_worker	health_insurance	opinion_h1n1_vacc_effective	opinion_h1n1_risk	opinion_h1n1_sick_from_vacc	opinion_seas_vacc_effective	opinion_seas_risk	opinion_seas_sick_from_vacc	household_adults	household_children
count	53415.000000	53238.000000	53177.000000	53265.000000	52994.000000	53377.000000	53333.000000	53256.000000	53251.000000	53159.000000	49095.000000	49095.000000	51512.000000	51782.000000	51822.000000	28913.000000	52626.000000	52647.000000	52645.000000	52501.000000	52402.000000	52357.000000	52941.000000	52941.000000
mean	26707.000000	1.620816	1.264287	0.049244	0.727705	0.069131	0.825849	0.355077	0.337271	0.680506	0.221489	0.331643	0.282148	0.084450	0.111709	0.883824	3.847623	2.334701	2.359141	4.025409	2.713923	2.130756	0.890406	0.539166
std	15419.726651	0.906534	0.616881	0.216380	0.445145	0.253679	0.379243	0.478541	0.472783	0.466285	0.415253	0.470808	0.450049	0.278064	0.315012	0.320442	1.007498	1.280608	1.361078	1.084875	1.380553	1.336077	0.753836	0.931626
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000
25%	13353.500000	1.000000	1.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	3.000000	1.000000	1.000000	4.000000	2.000000	1.000000	0.000000	0.000000
50%	26707.000000	2.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	4.000000	2.000000	2.000000	4.000000	2.000000	2.000000	1.000000	0.000000
75%	40060.500000	2.000000	2.000000	0.000000	1.000000	0.000000	1.000000	1.000000	1.000000	1.000000	0.000000	1.000000	1.000000	0.000000	0.000000	1.000000	5.000000	4.000000	4.000000	5.000000	4.000000	4.000000	1.000000	1.000000
max	53414.000000	3.000000	2.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	3.000000	3.000000

	h1n1_concern	h1n1_knowledge	opinion_h1n1_vacc_effective	opinion_h1n1_risk	opinion_h1n1_sick_from_vacc	opinion_seas_vacc_effective	opinion_seas_risk	opinion_seas_sick_from_vacc	household_adults	household_children
0	-0.687156	-2.051332	-0.849713	-1.045526	-0.261828	-1.882737	-1.240387	-0.096883	-1.187639	-0.575314
1	1.522093	1.196734	1.149910	1.313000	1.217575	-0.023220	-0.510912	1.414956	-1.187639	-0.575314
2	-0.687156	-0.427299	-0.849713	-1.045526	-1.001529	-0.023220	-1.240387	-0.096883	1.477084	-0.575314
3	-0.687156	-0.427299	-0.849713	0.526825	1.957276	0.906538	0.948037	-0.852803	-1.187639	-0.575314
4	0.417468	-0.427299	-0.849713	0.526825	-0.261828	-0.952978	-1.240387	1.414956	0.144723	-0.575314
...	...	...	...	...	...	...	...	...	...	...
53410	-0.687156	-0.427299	0.150098	-0.259350	-0.261828	-0.023220	-0.510912	-0.852803	0.144723	0.501284
53411	1.522093	-0.427299	0.150098	-1.045526	-1.001529	0.906538	-0.510912	-0.096883	0.144723	2.654480
53412	-1.791781	-0.427299	0.150098	0.526825	-1.001529	-0.023220	0.218562	-0.852803	0.144723	-0.575314
53413	1.522093	-0.427299	-1.849525	0.526825	1.217575	-0.023220	0.218562	-0.096883	0.144723	-0.575314
53414	0.417468	-0.427299	1.149910	-1.045526	-0.261828	0.906538	0.948037	-0.096883	-1.187639	-0.575314

	behavioral_avoidance_True	behavioral_wash_hands_True	behavioral_large_gatherings_True	behavioral_outside_home_True	behavioral_touch_face_True	doctor_recc_h1n1_No response	doctor_recc_seasonal_No response	doctor_recc_seasonal_True	chronic_med_condition_True	health_insurance_No response	health_insurance_True	age_group_35 - 44 Years	age_group_45 - 54 Years	age_group_55 - 64 Years	age_group_65+ Years	education_< 12 Years	education_College Graduate	education_Some College	race_White	sex_Male	income_poverty_Below Poverty	marital_status_Not Married	rent_or_own_Own	rent_or_own_Rent	employment_status_Not in Labor Force	hhs_geo_region_bhuqouqj	...	hhs_geo_region_oxchjgsf	hhs_geo_region_qufhixun	census_msa_MSA, Principle City	census_msa_Non-MSA	employment_industry_pxcmvdjn	employment_industry_rucpziij	employment_industry_wxleyezf	employment_occupation_emcorrxb	employment_occupation_xgwztkwe	employment_occupation_xtkaffoo
0	0	0	0	1	1	0	0	0	0	0	1	0	0	1	0	1	0	0	1	0	1	1	1	0	1	0	...	1	0	0	1	0	0	0	0	0	0
1	1	1	0	1	1	0	0	0	0	0	1	1	0	0	0	0	0	0	1	1	1	1	0	1	0	1	...	0	0	0	0	1	0	0	0	1	0
2	1	0	0	0	0	1	1	0	1	1	0	0	0	0	0	0	1	0	1	1	0	1	1	0	0	0	...	0	1	0	0	0	1	0	0	0	1
3	1	1	1	0	0	0	0	1	1	1	0	0	0	0	1	0	0	0	1	0	1	1	0	1	1	0	...	0	0	1	0	0	0	0	0	0	0
4	1	1	1	0	1	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	...	0	1	0	0	0	0	1	1	0	0

	ColName	Coeff	Magnitude
25	doctor_recc_h1n1_True	1.548399	1.548399
74	employment_industry_haxffmxo	1.410665	1.410665
93	employment_occupation_dcjcmpih	1.410665	1.410665
10	behavioral_antiviral_meds_No response	0.882929	0.882929
34	health_insurance_No response	-0.860136	0.860136
...	...	...	...
8	household_adults	-0.034367	0.034367
0	h1n1_concern	-0.016748	0.016748
21	behavioral_outside_home_True	-0.011060	0.011060
92	employment_occupation_cmhcxjea	-0.006870	0.006870
13	behavioral_avoidance_True	-0.002703	0.002703

	ColName	Coeff	Magnitude
0	doctor_recc_h1n1_True	1.548399	1.548399
1	employment_industry_haxffmxo	1.410665	1.410665
2	employment_occupation_dcjcmpih	1.410665	1.410665
3	behavioral_antiviral_meds_No response	0.882929	0.882929
4	health_insurance_No response	-0.860136	0.860136
5	employment_occupation_qxajmpny	-0.847009	0.847009
6	health_worker_True	0.787104	0.787104
7	health_insurance_True	0.785535	0.785535
8	employment_occupation_tfqavkke	-0.725048	0.725048
9	employment_occupation_uqqtjvyb	-0.713886	0.713886
10	age_group_65+ Years	0.656117	0.656117
11	employment_industry_phxvnwax	0.584773	0.584773
12	employment_industry_arjwrbjb	0.570566	0.570566
13	employment_industry_xicduogh	0.551852	0.551852
14	employment_occupation_xgwztkwe	-0.538446	0.538446
15	age_group_55 - 64 Years	0.536462	0.536462
16	employment_industry_fcxhlnwr	0.535500	0.535500
17	race_Other or Multiple	0.533174	0.533174
18	employment_status_No response	0.518320	0.518320
19	education_No response	0.513174	0.513174
20	child_under_6_months_No response	0.492454	0.492454
21	race_White	0.492303	0.492303
22	opinion_h1n1_vacc_effective	0.490032	0.490032
23	behavioral_face_mask_No response	0.489136	0.489136
24	opinion_h1n1_risk	0.470053	0.470053
25	rent_or_own_Rent	-0.469763	0.469763
26	behavioral_outside_home_No response	-0.448110	0.448110
27	employment_industry_msuufmds	0.426944	0.426944
28	employment_industry_nduyfdeo	0.391660	0.391660
29	employment_occupation_oijqvulv	-0.386089	0.386089
30	employment_industry_wxleyezf	0.379052	0.379052
31	employment_occupation_pvmttkik	-0.377823	0.377823
32	race_Hispanic	0.353311	0.353311
33	employment_industry_saaquncn	0.343572	0.343572
34	rent_or_own_Own	-0.338613	0.338613
35	opinion_seas_risk	0.334938	0.334938
36	employment_occupation_mxkfnird	-0.318211	0.318211
37	employment_industry_vjjrobsf	-0.303148	0.303148
38	hhs_geo_region_dqpwygqj	-0.284656	0.284656
39	employment_occupation_rcertsgn	-0.283871	0.283871

H1N1 & Seasonal Vaccine Machine Learning Predictor

Task: Can you predict whether people got H1N1 and seasonal flu vaccines using information they shared about their backgrounds, opinions, and health behaviors?¶

Loading the data¶

Training Features¶

Training Labels¶

We observe above that the H1N1 Vaccine respondents data is highly imbalanced.¶

Independence of multilabel variables¶

This further proves dependency in such a way that most of the people who received H1N1 vaccine also received the Seasonal Vaccine¶

Data Consolidation¶

Final DF with labels and features joined.¶

Feature Corelation¶

Identify Pairs with Correlation within 0.70¶

Feature Preprocessing¶

Feature importance - Lasso regression¶

We are going to choose the top 20 and bottom 20 predictors by feature importance directionality / magnitude.¶

Final Model¶

SGDClassifier Model¶

Contact Me

Email

Location

	respondent_id	h1n1_concern	h1n1_knowledge	behavioral_avoidance	behavioral_wash_hands	behavioral_large_gatherings	behavioral_outside_home	behavioral_touch_face	doctor_recc_h1n1	doctor_recc_seasonal	chronic_med_condition	health_insurance	opinion_h1n1_vacc_effective	opinion_h1n1_risk	opinion_h1n1_sick_from_vacc	opinion_seas_vacc_effective	opinion_seas_risk	opinion_seas_sick_from_vacc	age_group	education	race	sex	income_poverty	marital_status	rent_or_own	employment_status	hhs_geo_region	census_msa	household_adults	employment_industry	employment_occupation	feat
0	0	1.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	1.0	3.0	1.0	2.0	2.0	1.0	2.0	55 - 64 Years	< 12 Years	White	Female	Below Poverty	Not Married	Own	Not in Labor Force	oxchjgsf	Non-MSA	0.0	NaN	NaN	Train
1	1	3.0	2.0	1.0	1.0	0.0	1.0	1.0	0.0	0.0	0.0	1.0	5.0	4.0	4.0	4.0	2.0	4.0	35 - 44 Years	12 Years	White	Male	Below Poverty	Not Married	Rent	Employed	bhuqouqj	MSA, Not Principle City	0.0	pxcmvdjn	xgwztkwe	Train
2	2	1.0	1.0	1.0	0.0	0.0	0.0	0.0	NaN	NaN	1.0	NaN	3.0	1.0	1.0	4.0	1.0	2.0	18 - 34 Years	College Graduate	White	Male	<= $75,000, Above Poverty	Not Married	Own	Employed	qufhixun	MSA, Not Principle City	2.0	rucpziij	xtkaffoo	Train
3	3	1.0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	1.0	1.0	NaN	3.0	3.0	5.0	5.0	4.0	1.0	65+ Years	12 Years	White	Female	Below Poverty	Not Married	Rent	Not in Labor Force	lrircsnp	MSA, Principle City	0.0	NaN	NaN	Train
4	4	2.0	1.0	1.0	1.0	1.0	0.0	1.0	0.0	0.0	0.0	NaN	3.0	3.0	2.0	3.0	1.0	4.0	45 - 54 Years	Some College	White	Female	<= $75,000, Above Poverty	Married	Own	Employed	qufhixun	MSA, Not Principle City	1.0	wxleyezf	emcorrxb	Train

	respondent_id	h1n1_concern	h1n1_knowledge	behavioral_antiviral_meds	behavioral_avoidance	behavioral_face_mask	behavioral_wash_hands	behavioral_large_gatherings	behavioral_outside_home	behavioral_touch_face	doctor_recc_h1n1	doctor_recc_seasonal	chronic_med_condition	health_worker	health_insurance	opinion_h1n1_vacc_effective	opinion_h1n1_risk	opinion_h1n1_sick_from_vacc	opinion_seas_vacc_effective	opinion_seas_risk	opinion_seas_sick_from_vacc	age_group	education	race	sex	income_poverty	marital_status	rent_or_own	employment_status	hhs_geo_region	census_msa	household_adults	household_children	employment_industry	employment_occupation	feat
0	26707	2.0	2.0	0.0	1.0	0.0	1.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	1.0	5.0	1.0	1.0	5.0	1.0	1.0	35 - 44 Years	College Graduate	Hispanic	Female	> $75,000	Not Married	Rent	Employed	mlyzmhmf	MSA, Not Principle City	1.0	0.0	atmlpfrs	hfxkjkmi	Test
1	26708	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	4.0	1.0	1.0	4.0	1.0	1.0	18 - 34 Years	12 Years	White	Male	Below Poverty	Not Married	Rent	Employed	bhuqouqj	Non-MSA	3.0	0.0	atmlpfrs	xqwwgdyp	Test
2	26709	2.0	2.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	0.0	NaN	5.0	4.0	2.0	5.0	4.0	4.0	55 - 64 Years	College Graduate	White	Male	> $75,000	Married	Own	Employed	lrircsnp	Non-MSA	1.0	0.0	nduyfdeo	pvmttkik	Test
3	26710	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	0.0	1.0	4.0	2.0	2.0	4.0	4.0	2.0	65+ Years	12 Years	White	Female	<= $75,000, Above Poverty	Married	Own	Not in Labor Force	lrircsnp	MSA, Not Principle City	1.0	0.0	NaN	NaN	Test
4	26711	3.0	1.0	1.0	1.0	0.0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	1.0	1.0	5.0	2.0	4.0	4.0	4.0	2.0	35 - 44 Years	12 Years	Black	Female	<= $75,000, Above Poverty	Not Married	Own	Employed	lzgpxyit	Non-MSA	0.0	1.0	fcxhlnwr	mxkfnird	Test

	respondent_id	h1n1_concern	h1n1_knowledge	behavioral_antiviral_meds	behavioral_avoidance	behavioral_face_mask	behavioral_wash_hands	behavioral_large_gatherings	behavioral_outside_home	behavioral_touch_face	doctor_recc_h1n1	doctor_recc_seasonal	chronic_med_condition	child_under_6_months	health_worker	health_insurance	opinion_h1n1_vacc_effective	opinion_h1n1_risk	opinion_h1n1_sick_from_vacc	opinion_seas_vacc_effective	opinion_seas_risk	opinion_seas_sick_from_vacc	age_group	education	race	sex	income_poverty	marital_status	rent_or_own	employment_status	hhs_geo_region	census_msa	household_adults	employment_industry	employment_occupation	feat
0	0	1.0	0.0	False	False	False	False	False	True	True	False	False	False	False	False	True	3.0	1.0	2.0	2.0	1.0	2.0	55 - 64 Years	< 12 Years	White	Female	Below Poverty	Not Married	Own	Not in Labor Force	oxchjgsf	Non-MSA	0.0	No response	No response	Train
1	1	3.0	2.0	False	True	False	True	False	True	True	False	False	False	False	False	True	5.0	4.0	4.0	4.0	2.0	4.0	35 - 44 Years	12 Years	White	Male	Below Poverty	Not Married	Rent	Employed	bhuqouqj	MSA, Not Principle City	0.0	pxcmvdjn	xgwztkwe	Train
2	2	1.0	1.0	False	True	False	False	False	False	False	No response	No response	True	False	False	No response	3.0	1.0	1.0	4.0	1.0	2.0	18 - 34 Years	College Graduate	White	Male	<= $75,000, Above Poverty	Not Married	Own	Employed	qufhixun	MSA, Not Principle City	2.0	rucpziij	xtkaffoo	Train
3	3	1.0	1.0	False	True	False	True	True	False	False	False	True	True	False	False	No response	3.0	3.0	5.0	5.0	4.0	1.0	65+ Years	12 Years	White	Female	Below Poverty	Not Married	Rent	Not in Labor Force	lrircsnp	MSA, Principle City	0.0	No response	No response	Train
4	4	2.0	1.0	False	True	False	True	True	False	True	False	False	False	False	False	No response	3.0	3.0	2.0	3.0	1.0	4.0	45 - 54 Years	Some College	White	Female	<= $75,000, Above Poverty	Married	Own	Employed	qufhixun	MSA, Not Principle City	1.0	wxleyezf	emcorrxb	Train

	respondent_id	seasonal_vaccine	VaccineCategory	One_vaccine_only
0	0	0	No Vaccine	False
1	1	1	H1N1 Only	True
2	2	0	No Vaccine	False
3	3	1	H1N1 Only	True
4	4	0	No Vaccine	False

	behavioral_avoidance_True	behavioral_wash_hands_True	behavioral_large_gatherings_True	behavioral_outside_home_True	behavioral_touch_face_True	doctor_recc_h1n1_No response	doctor_recc_seasonal_No response	doctor_recc_seasonal_True	chronic_med_condition_True	health_insurance_No response	health_insurance_True	age_group_35 - 44 Years	age_group_45 - 54 Years	age_group_55 - 64 Years	age_group_65+ Years	education_< 12 Years	education_College Graduate	education_Some College	race_White	sex_Male	income_poverty_Below Poverty	marital_status_Not Married	rent_or_own_Own	rent_or_own_Rent	employment_status_Not in Labor Force	hhs_geo_region_bhuqouqj	...	hhs_geo_region_oxchjgsf	hhs_geo_region_qufhixun	census_msa_MSA, Principle City	census_msa_Non-MSA	employment_industry_pxcmvdjn	employment_industry_rucpziij	employment_industry_wxleyezf	employment_occupation_emcorrxb	employment_occupation_xgwztkwe	employment_occupation_xtkaffoo
0	0	0	0	1	1	0	0	0	0	0	1	0	0	1	0	1	0	0	1	0	1	1	1	0	1	0	...	1	0	0	1	0	0	0	0	0	0
1	1	1	0	1	1	0	0	0	0	0	1	1	0	0	0	0	0	0	1	1	1	1	0	1	0	1	...	0	0	0	0	1	0	0	0	1	0
2	1	0	0	0	0	1	1	0	1	1	0	0	0	0	0	0	1	0	1	1	0	1	1	0	0	0	...	0	1	0	0	0	1	0	0	0	1
3	1	1	1	0	0	0	0	1	1	1	0	0	0	0	1	0	0	0	1	0	1	1	0	1	1	0	...	0	0	1	0	0	0	0	0	0	0
4	1	1	1	0	1	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	...	0	1	0	0	0	0	1	1	0	0

	Y	Yhat	Model	correct_pred
0	0	0	Logit CV No Penalty	Correct Prediction
1	0	0	Logit CV No Penalty	Correct Prediction
2	0	0	Logit CV No Penalty	Correct Prediction
3	0	0	Logit CV No Penalty	Correct Prediction
4	0	0	Logit CV No Penalty	Correct Prediction
...	...	...	...	...
5337	1	0	SVM Poly	Incorrect Prediction
5338	0	0	SVM Poly	Correct Prediction
5339	0	1	SVM Poly	Incorrect Prediction
5340	0	0	SVM Poly	Correct Prediction
5341	0	0	SVM Poly	Correct Prediction

	Model	correct_pred	counts
7	Logit CV L1	Incorrect Prediction	691
9	Logit CV L2	Incorrect Prediction	691
11	Logit CV No Penalty	Incorrect Prediction	692
17	Logit No Penalty	Incorrect Prediction	692
15	Logit L2	Incorrect Prediction	693
13	Logit L1	Incorrect Prediction	694
23	SGD Classifier L2 Penalty	Incorrect Prediction	695
25	SGD Classifier No Penalty	Incorrect Prediction	695
5	Logit CV ElasticNet	Incorrect Prediction	698
27	SVM Poly	Incorrect Prediction	706
21	SGD Classifier L1 Penalty	Incorrect Prediction	707
1	DecisionTree Classifier Entropy	Incorrect Prediction	719
3	DecisionTree Classifier Gini	Incorrect Prediction	720
29	SVM RBF	Incorrect Prediction	724
19	SGD Classifier ElasticNet Penalty	Incorrect Prediction	734
18	SGD Classifier ElasticNet Penalty	Correct Prediction	4608
28	SVM RBF	Correct Prediction	4618
2	DecisionTree Classifier Gini	Correct Prediction	4622
0	DecisionTree Classifier Entropy	Correct Prediction	4623
20	SGD Classifier L1 Penalty	Correct Prediction	4635
26	SVM Poly	Correct Prediction	4636
4	Logit CV ElasticNet	Correct Prediction	4644
22	SGD Classifier L2 Penalty	Correct Prediction	4647
24	SGD Classifier No Penalty	Correct Prediction	4647
12	Logit L1	Correct Prediction	4648
14	Logit L2	Correct Prediction	4649
10	Logit CV No Penalty	Correct Prediction	4650
16	Logit No Penalty	Correct Prediction	4650
6	Logit CV L1	Correct Prediction	4651
8	Logit CV L2	Correct Prediction	4651

	Category	Precision	Recall	F1-score	Model
0	Not Vaccinated	0.891222	0.960616	0.924619	Logit CV No Penalty
1	Vaccinated	0.700000	0.439394	0.539894	Logit CV No Penalty
0	Not Vaccinated	0.891245	0.960842	0.924736	Logit CV L1
1	Vaccinated	0.701209	0.439394	0.540253	Logit CV L1
0	Not Vaccinated	0.891245	0.960842	0.924736	Logit CV L2
1	Vaccinated	0.701209	0.439394	0.540253	Logit CV L2
0	Not Vaccinated	0.888796	0.962426	0.924147	Logit CV ElasticNet
1	Vaccinated	0.702509	0.424242	0.529015	Logit CV ElasticNet
0	Not Vaccinated	0.891222	0.960616	0.924619	Logit No Penalty
1	Vaccinated	0.700000	0.439394	0.539894	Logit No Penalty
0	Not Vaccinated	0.891176	0.960163	0.924384	Logit L1
1	Vaccinated	0.697595	0.439394	0.539177	Logit L1
0	Not Vaccinated	0.891199	0.960389	0.924502	Logit L2
1	Vaccinated	0.698795	0.439394	0.539535	Logit L2
0	Not Vaccinated	0.886123	0.966953	0.924775	SGD Classifier No Penalty
1	Vaccinated	0.719770	0.405844	0.519031	SGD Classifier No Penalty
0	Not Vaccinated	0.885038	0.965369	0.923460	SGD Classifier L1 Penalty
1	Vaccinated	0.707457	0.400433	0.511403	SGD Classifier L1 Penalty
0	Not Vaccinated	0.886123	0.966953	0.924775	SGD Classifier L2 Penalty
1	Vaccinated	0.719770	0.405844	0.519031	SGD Classifier L2 Penalty
0	Not Vaccinated	0.909698	0.925758	0.917658	SGD Classifier ElasticNet Penalty
1	Vaccinated	0.612293	0.560606	0.585311	SGD Classifier ElasticNet Penalty
0	Not Vaccinated	0.891926	0.952694	0.921309	DecisionTree Classifier Entropy
1	Vaccinated	0.664526	0.448052	0.535229	DecisionTree Classifier Entropy
0	Not Vaccinated	0.891074	0.953599	0.921277	DecisionTree Classifier Gini
1	Vaccinated	0.666124	0.442641	0.531860	DecisionTree Classifier Gini
0	Not Vaccinated	0.889662	0.954504	0.920943	SVM RBF
1	Vaccinated	0.666113	0.433983	0.525557	SVM RBF
0	Not Vaccinated	0.892887	0.954731	0.922774	SVM Poly
1	Vaccinated	0.676375	0.452381	0.542153	SVM Poly

	behavioral_avoidance_True	behavioral_wash_hands_True	behavioral_large_gatherings_True	behavioral_outside_home_True	behavioral_touch_face_True	doctor_recc_h1n1_No response	doctor_recc_seasonal_No response	doctor_recc_seasonal_True	chronic_med_condition_True	health_insurance_No response	health_insurance_True	age_group_35 - 44 Years	age_group_45 - 54 Years	age_group_55 - 64 Years	age_group_65+ Years	education_< 12 Years	education_College Graduate	education_Some College	race_White	sex_Male	income_poverty_Below Poverty	marital_status_Not Married	rent_or_own_Own	rent_or_own_Rent	employment_status_Not in Labor Force	hhs_geo_region_bhuqouqj	...	hhs_geo_region_oxchjgsf	hhs_geo_region_qufhixun	census_msa_MSA, Principle City	census_msa_Non-MSA	employment_industry_pxcmvdjn	employment_industry_rucpziij	employment_industry_wxleyezf	employment_occupation_emcorrxb	employment_occupation_xgwztkwe	employment_occupation_xtkaffoo
0	0	0	0	1	1	0	0	0	0	0	1	0	0	1	0	1	0	0	1	0	1	1	1	0	1	0	...	1	0	0	1	0	0	0	0	0	0
1	1	1	0	1	1	0	0	0	0	0	1	1	0	0	0	0	0	0	1	1	1	1	0	1	0	1	...	0	0	0	0	1	0	0	0	1	0
2	1	0	0	0	0	1	1	0	1	1	0	0	0	0	0	0	1	0	1	1	0	1	1	0	0	0	...	0	1	0	0	0	1	0	0	0	1
3	1	1	1	0	0	0	0	1	1	1	0	0	0	0	1	0	0	0	1	0	1	1	0	1	1	0	...	0	0	1	0	0	0	0	0	0	0
4	1	1	1	0	1	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	...	0	1	0	0	0	0	1	1	0	0

	behavioral_avoidance_True	behavioral_wash_hands_True	behavioral_large_gatherings_True	behavioral_outside_home_True	behavioral_touch_face_True	doctor_recc_h1n1_No response	doctor_recc_seasonal_No response	doctor_recc_seasonal_True	chronic_med_condition_True	health_insurance_No response	health_insurance_True	age_group_35 - 44 Years	age_group_45 - 54 Years	age_group_55 - 64 Years	age_group_65+ Years	education_< 12 Years	education_College Graduate	education_Some College	race_White	sex_Male	income_poverty_Below Poverty	marital_status_Not Married	rent_or_own_Own	rent_or_own_Rent	employment_status_Not in Labor Force	hhs_geo_region_bhuqouqj	...	hhs_geo_region_oxchjgsf	hhs_geo_region_qufhixun	census_msa_MSA, Principle City	census_msa_Non-MSA	employment_industry_pxcmvdjn	employment_industry_rucpziij	employment_industry_wxleyezf	employment_occupation_emcorrxb	employment_occupation_xgwztkwe	employment_occupation_xtkaffoo
0	0	0	0	1	1	0	0	0	0	0	1	0	0	1	0	1	0	0	1	0	1	1	1	0	1	0	...	1	0	0	1	0	0	0	0	0	0
1	1	1	0	1	1	0	0	0	0	0	1	1	0	0	0	0	0	0	1	1	1	1	0	1	0	1	...	0	0	0	0	1	0	0	0	1	0
2	1	0	0	0	0	1	1	0	1	1	0	0	0	0	0	0	1	0	1	1	0	1	1	0	0	0	...	0	1	0	0	0	1	0	0	0	1
3	1	1	1	0	0	0	0	1	1	1	0	0	0	0	1	0	0	0	1	0	1	1	0	1	1	0	...	0	0	1	0	0	0	0	0	0	0
4	1	1	1	0	1	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	...	0	1	0	0	0	0	1	1	0	0