import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# NOTE(review): CatboostIpythonWidget was removed from the public catboost API
# in newer releases -- this import fails there; confirm the pinned catboost version.
from catboost import CatBoostClassifier, Pool, cv, CatboostIpythonWidget
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
# global random seed shared by CV and model training for reproducibility
rdn = 42
# the three competition countries; all per-country lists below follow this order
countries = ['A', 'B', 'C']
# download data (just once!)
# import wget
# url = 'https://s3.amazonaws.com/drivendata/data/50/public/'
# levels = ['hhold', 'indiv']
# types = ['train', 'test']
# wget.download(url + 'submission_format.csv', './input/submission_format.csv')
# [wget.download(f'{url}{x}_{y}_{z}.csv' , f'./input/{x}_{y}_{z}.csv') for x in countries for y in levels for z in types]
# read all data, separately! Because they all have different columns
# household-level frames, one per country, indexed by household id
hhld_train = [pd.read_csv(f'./input/{x}_hhold_train.csv', index_col=['id']) for x in countries]
hhld_test = [pd.read_csv(f'./input/{x}_hhold_test.csv', index_col=['id']) for x in countries]
# individual-level frames: multiple rows per household, indexed (iid, id)
indiv_train = [pd.read_csv(f'./input/{x}_indiv_train.csv', index_col=['iid', 'id']) for x in countries]
indiv_test = [pd.read_csv(f'./input/{x}_indiv_test.csv', index_col=['iid', 'id']) for x in countries]
# submission template: defines the required row order for the final csv
template = pd.read_csv('./input/submission_format.csv', index_col=['id'])
# EDA for the train and test data
## Check for missing values
print([x.isnull().sum().sum() for x in hhld_train])
# print([x.isnull().sum().sum() for x in hhld_test])
## inspect poverty distribution
print([x.poor.value_counts() for x in hhld_train])
## inspect integers and range
print([x.describe() for x in hhld_train])
## inspect number of categories (min and max)
# np.object was removed in NumPy 1.24 -- select object columns directly instead
print([x.select_dtypes(include='object').nunique().max() for x in hhld_train])
# country B is the only set with NaNs; pull out just its columns that have any
train_b_na = hhld_train[1][hhld_train[1].columns[hhld_train[1].isnull().any()]]
print(train_b_na.info())
test_b_na = hhld_test[1][hhld_test[1].columns[hhld_test[1].isnull().any()]]
print(test_b_na.info())
# The missing values in country B concentrate on 9 numeric columns. To deal
# with the NaNs, try the following strategies:
def make_cv(name, train, n):
    """Run stratified CatBoost cross-validation on one country's household frame.

    name  -- run name; also names the train_dir under models/
    train -- DataFrame with the 'poor' target and a 'country' column to drop
    n     -- number of boosting iterations
    Returns the scores object produced by catboost.cv.
    """
    # TODO: scale numerics, drop single level columns, impute missing
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # Columns that are neither float nor int are passed to CatBoost as
    # categorical. Use the builtin float/int: np.float and np.int were
    # deprecated aliases and removed in NumPy 1.24.
    cat_ind = np.where((X.dtypes != float) & (X.dtypes != int))[0]
    pool = Pool(X.values, y.values, cat_features=cat_ind)
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU',
                               name=name, iterations=n,
                               loss_function='Logloss', random_seed=rdn)
    scores = cv(pool, model.get_params(), stratified=True, seed=rdn,
                logging_level='Verbose')
    return scores
def model_train(name, train, n):
    """Fit a CatBoost classifier on one country's household frame.

    name  -- run name; also names the train_dir under models/
    train -- DataFrame with the 'poor' target and a 'country' column to drop
    n     -- number of boosting iterations (pick from CV's best iteration)
    Returns the fitted CatBoostClassifier.
    """
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # Builtin float/int instead of np.float/np.int, which were removed in
    # NumPy 1.24; non-numeric columns become categorical features.
    cat_ind = np.where((X.dtypes != float) & (X.dtypes != int))[0]
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU',
                               name=name, iterations=n,
                               loss_function='Logloss', random_seed=rdn)
    model.fit(X, y, cat_features=cat_ind, verbose=True)
    return model
def pred_make(model, X_test, country):
    """Build a submission-format frame (id, country, poor) for one country.

    'poor' is the model's predicted probability of the positive class,
    computed on X_test with its 'country' column removed.
    """
    features = X_test.drop('country', axis=1)
    proba_poor = model.predict_proba(features)[:, 1]
    return pd.DataFrame({
        'id': X_test.index.get_level_values('id'),
        'country': country,
        'poor': proba_poor,
    })
def mean_logloss(scores_a, scores_b, scores_c):
    """Average the best CV logloss of the three countries, weighted by the
    size of each country's household test set (global hhld_test)."""
    best_losses = [min(s['Logloss_test_avg']) for s in (scores_a, scores_b, scores_c)]
    weights = np.array([len(t) for t in hhld_test])
    return np.average(best_losses, weights=weights)
# clean train_b and examine effects
## try 1: by dropna(axis=1)
# country B is the only one with NaNs; drop its NaN columns before CV
scores_b = make_cv('hhld_dropna_b', hhld_train[1].dropna(axis=1), 600)
scores_a = make_cv('hhld_a', hhld_train[0], 2000)
scores_c = make_cv('hhld_c', hhld_train[2], 500)
scores_a.keys()
# best iteration count = index of the minimum test logloss in the CV curve
np.argmin(scores_b['Logloss_test_avg'])
# model_a = model_train('hhld_a', hhld_train[0], )
# test train
# retrain each country's model at its best CV iteration count
model_b = model_train('hhld_dropna_b', hhld_train[1].dropna(axis=1), np.argmin(scores_b['Logloss_test_avg']))
model_a = model_train('hhld_a', hhld_train[0], np.argmin(scores_a['Logloss_test_avg']))
model_c = model_train('hhld_c', hhld_train[2], np.argmin(scores_c['Logloss_test_avg']))
# predict and submit
submission = pd.concat([pred_make(x, y.dropna(axis=1), z) for x, y, z in zip([model_a, model_b, model_c], hhld_test, countries)], axis=0)
submission.to_csv('output/submission_b_dropna.csv', index=False)
# calculate score from cv
mean_logloss(scores_a, scores_b, scores_c)
## Check for missing values in indiv data
print([x.isnull().sum().sum() for x in indiv_train])
# print([x.isnull().sum().sum() for x in hhld_test])
## inspect poverty distribution
print([x.poor.value_counts() for x in indiv_train])
## inspect integers and range
print([x.describe() for x in indiv_train])
## inspect number of categories (min and max)
# np.object was removed in NumPy 1.24 -- select object columns directly instead
print([x.select_dtypes(include='object').nunique().max() for x in indiv_train])
# inspect NaN in indiv (train and test same columns of missing data)
[x.isnull().any().sum() for x in indiv_train]
# [x.isnull().any().sum() for x in indiv_test]
# DEBUG: leftjoin indiv to hhld on train A
# indiv_train[0].head()
# pd.Series(['A', 'A', 'B', 'C', 'C']).value_counts().index[0]
# per-household mode of each categorical column (value_counts sorts by
# frequency, so index[0] is the most common value)
indiv_a_mean = indiv_train[0].select_dtypes(include='object').groupby('id').agg(lambda s: s.value_counts().index[0])
indiv_a_mean.head()
# train_a_concat = pd.concat([hhld_train[0], indiv_a_mean])
indiv_train_dropna = [x.dropna(axis=1).drop(['poor', 'country'], axis=1) for x in indiv_train]
# Collapse individual-level rows to one row per household: mode for object
# (categorical) columns, mean for numeric columns. The builtin `object`
# replaces np.object, which was removed in NumPy 1.24; the lambda argument is
# named `s` so it no longer shadows the comprehension variable `x`.
indiv_train_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]),
                                  x.loc[:, x.dtypes != object].groupby('id').agg('mean')], axis=1)
                       for x in indiv_train_dropna]
indiv_test_dropna = [x.dropna(axis=1).drop(['country'], axis=1) for x in indiv_test]
indiv_test_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]),
                                 x.loc[:, x.dtypes != object].groupby('id').agg('mean')], axis=1)
                      for x in indiv_test_dropna]
# join the reduced individual features onto the household frames by id
combined_train = [pd.concat([x.dropna(axis=1), y], axis=1) for x, y in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([x.dropna(axis=1), y], axis=1) for x, y in zip(hhld_test, indiv_test_reduced)]
# indiv_train_reduced[0].head()
# hhld_train[0].head()
# combined_train[0].head()
print([x.isnull().any().sum() for x in hhld_train])
print([x.isnull().any().sum() for x in combined_train])
print([x.isnull().any().sum() for x in hhld_test])
print([x.isnull().any().sum() for x in combined_test])
# CV, train, and submit for the combined (hhold + reduced indiv) dropna data
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, [2000, 600, 500])]
# train each model at the iteration count with the lowest CV test logloss
combined_models = [model_train(x, y, np.argmin(z['Logloss_test_avg'])) for x, y, z in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, combined_cvs)]
# reindex to the submission template so rows come out in the required order
submission_combined = pd.concat([pred_make(x, y, z) for x, y, z in zip(combined_models, combined_test, countries)], axis=0).set_index('id').reindex(template.index)
submission_combined.to_csv('output/submission_combined_dropna.csv', index=True)
mean_logloss(*combined_cvs)
# fillna flow for combined data: instead of dropping NaN columns, impute with
# a -99999 sentinel and aggregate indiv numerics with the median.
# (`object` builtin replaces np.object, removed in NumPy 1.24.)
indiv_train_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]),
                                  x.loc[:, x.dtypes != object].groupby('id').agg('median')], axis=1)
                       for x in [x.fillna(-99999).drop(['poor', 'country'], axis=1) for x in indiv_train]]
indiv_test_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]),
                                 x.loc[:, x.dtypes != object].groupby('id').agg('median')], axis=1)
                      for x in [x.fillna(-99999).drop(['country'], axis=1) for x in indiv_test]]
combined_train = [pd.concat([x.fillna(-99999), y], axis=1) for x, y in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([x.fillna(-99999), y], axis=1) for x, y in zip(hhld_test, indiv_test_reduced)]
# BUG FIX: run CV on the fillna data BEFORE training -- the original trained
# with iteration counts left over from the dropna flow's stale combined_cvs,
# and used inconsistent run names ('combined_fillna' without country letters).
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, [2000, 600, 500])]
combined_models = [model_train(x, y, np.argmin(z['Logloss_test_avg'])) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, combined_cvs)]
submission_combined = pd.concat([pred_make(x, y, z) for x, y, z in zip(combined_models, combined_test, countries)], axis=0).set_index('id').reindex(template.index)
submission_combined.to_csv('output/submission_combined_fillna_med.csv', index=True)
mean_logloss(*combined_cvs)