Desert Poverty Lab: Measuring Poverty One Country at a Time

Roadmap

  1. Download the data (once)
  2. Inspect each country's train and test sets
  3. Impute missing values
  4. Test a baseline model
  5. Perform cross-validation and feature selection
In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
rdn = 42  # fixed random seed used throughout
countries = ['A', 'B', 'C']
In [4]:
# download data (just once!)
# import wget
# url = 'https://s3.amazonaws.com/drivendata/data/50/public/'
# levels = ['hhold', 'indiv']
# types = ['train', 'test']
# wget.download(url + 'submission_format.csv', './input/submission_format.csv')
# [wget.download(f'{url}{x}_{y}_{z}.csv' , f'./input/{x}_{y}_{z}.csv') for x in countries for y in levels for z in types]  
Out[4]:
'./input/submission_format.csv'
In [2]:
# read each country's files separately, since the countries have different columns
hhld_train = [pd.read_csv(f'./input/{x}_hhold_train.csv', index_col=['id']) for x in countries]
hhld_test = [pd.read_csv(f'./input/{x}_hhold_test.csv', index_col=['id']) for x in countries]
indiv_train = [pd.read_csv(f'./input/{x}_indiv_train.csv', index_col=['iid', 'id']) for x in countries]
indiv_test = [pd.read_csv(f'./input/{x}_indiv_test.csv', index_col=['iid', 'id']) for x in countries]
template = pd.read_csv('./input/submission_format.csv', index_col=['id'])
In [73]:
# EDA for the train and test data
## Check for missing values
print([x.isnull().sum().sum() for x in hhld_train])
# print([x.isnull().sum().sum() for x in hhld_test])

## inspect poverty distribution
print([x.poor.value_counts() for x in hhld_train])

## inspect integers and range
print([x.describe() for x in hhld_train])

## inspect number of categories (min and max)
print([np.max(x.iloc[:, np.where(x.dtypes == object)[0]].nunique()) for x in hhld_train])
[0, 16784, 0]
[False    4500
True     3703
Name: poor, dtype: int64, False    3004
True      251
Name: poor, dtype: int64, False    5496
True      973
Name: poor, dtype: int64]
[          nEsgxvAq     OMtioXZZ     YFMZwKrU     TiwRslOh
count  8203.000000  8203.000000  8203.000000  8203.000000
mean     -7.590638    17.464464    -2.985615    -4.191028
std       5.810942    10.853654     0.896245     4.472567
min     -70.000000  -127.000000    -4.000000   -31.000000
25%     -10.000000    12.000000    -4.000000    -7.000000
50%      -4.000000    12.000000    -3.000000    -3.000000
75%      -4.000000    21.000000    -2.000000    -1.000000
max      -4.000000   111.000000     1.000000     3.000000,           wJthinfa     ZvEApWrk     vuQrLzvK    FGWqGkmD     qrOrXLPM  \
count  3255.000000  3255.000000  3255.000000  602.000000  3255.000000   
mean     43.381260    96.040860    17.427343   -7.509967    22.203379   
std      22.728441   105.556895    72.057949    9.499141     6.962658   
min    -126.000000    -2.000000  -125.000000  -53.000000     8.000000   
25%      26.000000    33.000000   -39.000000  -13.000000    16.000000   
50%      42.000000    68.000000    27.000000   -8.000000    24.000000   
75%      58.000000   138.000000    77.000000    2.000000    24.000000   
max     122.000000  1069.000000   127.000000    2.000000    48.000000   

          BXOWgPgL    umkFMfvA     McFBIGsm     NjDdhqIe     rCVqiShm  \
count  2504.000000  890.000000  2504.000000  3255.000000  3255.000000   
mean    158.354633  -33.279775   301.106230    88.597849   -48.195392   
std     124.535287    8.231694   155.904844   107.268927    52.981575   
min     -40.000000  -63.000000   -43.000000    -7.000000  -968.000000   
25%      50.000000  -36.000000   185.000000    28.000000   -48.000000   
50%     150.000000  -36.000000   305.000000    63.000000   -28.000000   
75%     250.000000  -27.000000   425.000000    98.000000   -28.000000   
max     500.000000  -18.000000   605.000000  1253.000000    -8.000000   

          ...         IrxBnWxE     BRzuVmyf      dnlnKrAg     VyHofjLM  \
count     ...       272.000000  1794.000000    532.000000  3255.000000   
mean      ...         0.647059    45.675585 -15965.135338     1.974808   
std       ...         9.097690    41.675286     39.715899     1.565015   
min       ...       -61.000000     9.000000 -16047.000000    -2.000000   
25%       ...         3.000000    21.000000 -15999.000000     2.000000   
50%       ...         3.000000    36.000000 -15959.000000     2.000000   
75%       ...         3.000000    51.000000 -15927.000000     2.000000   
max       ...         3.000000   276.000000 -15911.000000     8.000000   

          GrLBZowF     oszSdLhD    aAufyreG     cDhZjxaW     OSmfjCbE  \
count  3255.000000  3255.000000  909.000000  3255.000000  2504.000000   
mean   -249.528111     0.670661   45.782178   -85.937020  -339.568291   
std     322.468103     1.833827   49.499821   114.537914   147.833796   
min   -5044.000000   -23.000000   -6.000000 -3639.000000  -506.000000   
25%    -364.000000     1.000000   12.000000  -119.000000  -501.000000   
50%    -184.000000     1.000000   39.000000   -59.000000  -356.000000   
75%     -64.000000     1.000000   48.000000   -39.000000  -256.000000   
max      -4.000000     1.000000  426.000000     1.000000    34.000000   

          IOMvIGQS  
count  3255.000000  
mean     78.568356  
std      63.123421  
min       0.000000  
25%      50.000000  
50%      50.000000  
75%     100.000000  
max     900.000000  

[8 rows x 23 columns],           LhUIIEHQ     PNAiwXUz     jmsRIiqp     NONtAKOM     kLAQgdly  \
count  6469.000000  6469.000000  6469.000000  6469.000000  6469.000000   
mean      7.899366    22.646932    10.876179    -6.674911  -298.008966   
std       1.482295    78.914475     9.164273    10.723440   286.793836   
min       7.000000    -9.000000     1.000000  -100.000000 -4150.000000   
25%       7.000000    -5.000000     1.000000   -10.000000  -430.000000   
50%       7.000000    -1.000000     8.000000    -4.000000  -250.000000   
75%       9.000000     7.000000    15.000000     2.000000   -10.000000   
max      35.000000   711.000000    85.000000     8.000000   -10.000000   

          WWuPOkor     CtFxPQPT     GIwNbAsH     qLDzvjiU     detlNNFh  \
count  6469.000000  6469.000000  6469.000000  6469.000000  6469.000000   
mean      3.427887   -68.832586    -3.166950     0.315659     2.667337   
std       2.999990   159.918843     6.366799     1.264827    28.862018   
min     -20.000000 -1611.000000   -66.000000    -1.000000    -1.000000   
25%       0.000000   -43.000000    -6.000000    -1.000000    -1.000000   
50%       5.000000   -15.000000    -3.000000     0.000000    -1.000000   
75%       5.000000    -8.000000     3.000000     1.000000    -1.000000   
max       5.000000    -1.000000     3.000000    11.000000  1394.000000   

          ...          kiAJBGqv     aFKPYcDt     gAZloxqF     phbxKGlB  \
count     ...       6469.000000  6469.000000  6469.000000  6469.000000   
mean      ...         -0.375947    -6.743392  -157.758695  -292.249652   
std       ...          3.084907     5.106988   258.817220   427.077633   
min       ...         -4.000000    -9.000000 -2699.000000 -3602.000000   
25%       ...         -3.000000    -9.000000  -179.000000  -425.000000   
50%       ...         -1.000000    -9.000000   -39.000000   -83.000000   
75%       ...          1.000000    -9.000000   -19.000000   -29.000000   
max       ...         30.000000    39.000000     1.000000    -2.000000   

          nTaJkLaJ     ZZGQNLOX     snkiwkvf     POJXrpmn     vSqQCatY  \
count  6469.000000  6469.000000  6469.000000  6469.000000  6469.000000   
mean     79.976503   -19.349668  -173.290153     0.904004   112.213789   
std     166.402429     5.017426   313.115779     2.415210   249.821599   
min      -2.000000   -55.000000 -2410.000000   -18.000000    -5.000000   
25%       5.000000   -19.000000  -160.000000     2.000000    -2.000000   
50%      19.000000   -19.000000   -40.000000     2.000000     4.000000   
75%      68.000000   -19.000000   -22.000000     2.000000    85.000000   
max    1748.000000   -13.000000   -10.000000     2.000000  1495.000000   

          mmoCpqWS  
count  6469.000000  
mean     12.788994  
std      58.638214  
min    -126.000000  
25%      -6.000000  
50%       1.000000  
75%      52.000000  
max     127.000000  

[8 rows x 30 columns]]
[31, 64, 85]
In [35]:
train_b_na = hhld_train[1][hhld_train[1].columns[hhld_train[1].isnull().any()]]
print(train_b_na.info())
test_b_na = hhld_test[1][hhld_test[1].columns[hhld_test[1].isnull().any()]]
print(test_b_na.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3255 entries, 57071 to 4923
Data columns (total 9 columns):
FGWqGkmD    602 non-null float64
BXOWgPgL    2504 non-null float64
umkFMfvA    890 non-null float64
McFBIGsm    2504 non-null float64
IrxBnWxE    272 non-null float64
BRzuVmyf    1794 non-null float64
dnlnKrAg    532 non-null float64
aAufyreG    909 non-null float64
OSmfjCbE    2504 non-null float64
dtypes: float64(9)
memory usage: 254.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1604 entries, 9135 to 52739
Data columns (total 9 columns):
FGWqGkmD    317 non-null float64
BXOWgPgL    1243 non-null float64
umkFMfvA    441 non-null float64
McFBIGsm    1243 non-null float64
IrxBnWxE    139 non-null float64
BRzuVmyf    881 non-null float64
dnlnKrAg    244 non-null float64
aAufyreG    428 non-null float64
OSmfjCbE    1245 non-null float64
dtypes: float64(9)
memory usage: 125.3 KB
None

The missing values in country B are concentrated in 9 numeric columns. To deal with the NaNs, try the following strategies (sketched below):

  1. drop the columns containing NAs
  2. fill with an extreme sentinel value (-99999)
  3. impute a central tendency (mean, median, etc.)
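
A minimal sketch of the three strategies on a toy frame (the toy DataFrame and its column names are illustrative, not competition data):
In [ ]:
# sketch only: the three NaN strategies on a toy frame
toy = pd.DataFrame({'num': [1.0, np.nan, 3.0], 'cat': ['a', 'b', 'b']})
dropped = toy.dropna(axis=1)                     # 1. drop any column containing NAs
sentinel = toy.fillna(-99999)                    # 2. fill with an extreme sentinel value
imputed = toy.fillna({'num': toy.num.median()})  # 3. impute a central tendency (median here)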
In [1]:
def make_cv(name, train, n):
    # run stratified CatBoost CV on one country's training frame
    # TODO: scale numerics, drop single-level columns, impute missing
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # every column that is neither float nor int is treated as categorical
    cat_ind = np.where((X.dtypes != np.float64) & (X.dtypes != np.int64))[0]
    pool = Pool(X.values, y.values, cat_features=cat_ind)
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU', name=name, iterations=n, loss_function='Logloss', random_seed=rdn)
    scores = cv(pool, model.get_params(), stratified=True, seed=rdn, logging_level='Verbose')
    return scores

def model_train(name, train, n):
    # fit a final CatBoost model on the full training frame, with the CV-chosen iteration count n
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    cat_ind = np.where((X.dtypes != np.float64) & (X.dtypes != np.int64))[0]
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU', name=name, iterations=n, loss_function='Logloss', random_seed=rdn)
    model.fit(X, y, cat_features=cat_ind, verbose=True)
    return model

def pred_make(model, X_test, country):
    # assemble one country's slice of the submission: id, country, P(poor)
    df = pd.DataFrame()
    df['id'] = X_test.index.get_level_values('id')
    df['country'] = country
    df['poor'] = model.predict_proba(X_test.drop('country', axis=1))[:, 1]
    return df

def mean_logloss(scores_a, scores_b, scores_c):
    # average each country's best CV log loss, weighted by its test-set size
    best = [np.min(s['Logloss_test_avg']) for s in (scores_a, scores_b, scores_c)]
    return np.average(best, weights=[x.shape[0] for x in hhld_test])
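
For intuition, mean_logloss is just a test-size-weighted average of each country's best CV log loss. A toy illustration (all numbers are made up, not actual scores or row counts):
In [ ]:
# toy illustration of the weighted average inside mean_logloss
best_logloss = np.array([0.25, 0.40, 0.12])   # hypothetical per-country CV minima
test_sizes = np.array([4000, 1600, 3000])     # hypothetical test-set row counts
np.average(best_logloss, weights=test_sizes)  # the blended CV estimate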
In [ ]:
# clean train B and examine the effect
## try 1: drop the NA columns with dropna(axis=1)
scores_b = make_cv('hhld_dropna_b', hhld_train[1].dropna(axis=1), 600)
In [ ]:
scores_a = make_cv('hhld_a', hhld_train[0], 2000)
In [ ]:
scores_c = make_cv('hhld_c', hhld_train[2], 500)
In [109]:
scores_a.keys()  # inspect the keys available in the CV results
np.argmin(scores_b['Logloss_test_avg'])  # index of the best CV iteration for country B

# model_a = model_train('hhld_a', hhld_train[0], )
Out[109]:
574
In [ ]:
# train the final models with the CV-selected iteration counts
model_b = model_train('hhld_dropna_b', hhld_train[1].dropna(axis=1), np.argmin(scores_b['Logloss_test_avg']))
In [ ]:
model_a = model_train('hhld_a', hhld_train[0], np.argmin(scores_a['Logloss_test_avg']))
In [ ]:
model_c = model_train('hhld_c', hhld_train[2], np.argmin(scores_c['Logloss_test_avg']))
In [116]:
# predict and submit
submission = pd.concat([pred_make(x, y.dropna(axis=1), z) for x, y, z in zip([model_a, model_b, model_c], hhld_test, countries)], axis=0)
submission.to_csv('output/submission_b_dropna.csv', index=False)
In [132]:
# calculate score from cv
mean_logloss(scores_a, scores_b, scores_c)
Out[132]:
0.17004743833106328

Incorporate indiv data into hhld

Reduce each country's individual-level (indiv) table to one row per household id, taking the mode of categorical columns and the mean of numeric ones, then concatenate the result with the hhld features (see the toy sketch after the DEBUG cell below).

In [117]:
## Check for missing values in indiv data
print([x.isnull().sum().sum() for x in indiv_train])
# print([x.isnull().sum().sum() for x in indiv_test])

## inspect poverty distribution
print([x.poor.value_counts() for x in indiv_train])

## inspect integers and range
print([x.describe() for x in indiv_train])

## inspect number of categories (min and max)
print([np.max(x.iloc[:, np.where(x.dtypes == object)[0]].nunique()) for x in indiv_train])
[6268, 461273, 0]
[True     19684
False    17876
Name: poor, dtype: int64, False    18375
True      1877
Name: poor, dtype: int64, False    22868
True      7045
Name: poor, dtype: int64]
[           OdXpbPGJ      ukWqmeSS
count  31292.000000  37560.000000
mean       8.719129    107.022764
std       21.089956     91.795117
min        4.000000      1.000000
25%        4.000000     36.000000
50%        4.000000     81.000000
75%        4.000000    151.000000
max      214.000000    551.000000,           BoxViLPz     qlLzyqpP     unRAgFtX      TJGiunYp    WmKLEUcd  \
count  5459.000000  1185.000000  1652.000000  12454.000000  354.000000   
mean    -34.296025   -38.275949   -87.351090      0.629758    1.161017   
std      18.357318    30.277305    89.976221      1.801028    3.992848   
min     -68.000000  -177.000000  -644.000000     -1.000000  -19.000000   
25%     -50.000000   -51.000000  -122.000000      0.000000   -1.000000   
50%     -32.000000   -33.000000   -50.000000      0.000000    2.000000   
75%     -20.000000    -9.000000   -23.000000      1.000000    5.000000   
max      -8.000000    -9.000000    -5.000000     16.000000    5.000000   

         DYgxQeEi    jfsTwowc      MGfpfHam     esHWAAyG    DtcKwIEv  \
count   48.000000  136.000000    438.000000  12804.00000  993.000000   
mean  -140.145833   58.367647 -20053.849315     -8.52757    7.221551   
std    166.546778   50.378392     14.162513      9.36105    1.687615   
min   -800.000000    0.000000 -20067.000000   -148.00000   -6.000000   
25%   -174.500000   18.000000 -20067.000000     -8.00000    7.000000   
50%    -95.000000   45.000000 -20057.000000     -8.00000    8.000000   
75%    -34.250000  108.000000 -20047.000000     -1.00000    8.000000   
max      0.000000  324.000000 -19897.000000     -1.00000    8.000000   

          ...         AJgudnHB      iZhWxnWa     fyfDnyQk      wJthinfa  \
count     ...        21.000000   2142.000000  1652.000000  20252.000000   
mean      ...        32.142857 -15965.735761     0.328692      0.452400   
std       ...        85.752572   1962.376212    10.110190      3.025749   
min       ...         4.000000 -80001.000000   -51.000000     -6.000000   
25%       ...         7.000000 -16001.000000     4.000000     -2.000000   
50%       ...         9.000000 -15945.000000     4.000000      0.000000   
75%       ...        18.000000 -15849.000000     4.000000      2.000000   
max       ...       403.000000 -15369.000000     4.000000     14.000000   

           nxAFXxLQ      mAeaImix    HZqPmvkr      ulQCDoYe     tzYvQeOb  \
count    361.000000  16524.000000  438.000000  20252.000000  2025.000000   
mean    1148.722992     -8.763738   21.547945    -21.382579  -680.751605   
std     2258.035086      3.230701   17.392132     67.795927   189.030118   
min      152.000000    -28.000000   -4.000000   -127.000000  -810.000000   
25%      602.000000     -8.000000    8.000000    -77.000000  -810.000000   
50%      902.000000     -8.000000   20.000000    -32.000000  -810.000000   
75%     1202.000000     -8.000000   32.000000     19.000000  -570.000000   
max    29999.000000     -8.000000   62.000000    125.000000   -10.000000   

          NfpXxGQk  
count  3804.000000  
mean  -7942.520505  
std      70.342966  
min   -8023.000000  
25%   -7995.000000  
50%   -7963.000000  
75%   -7911.000000  
max   -7675.000000  

[8 rows x 32 columns],            XKQWlRjk      vWNISgEA      bsMfXBld      XKyOwsRR      CgAkQtOd
count  29913.000000  29913.000000  29913.000000  29913.000000  29913.000000
mean       1.481831     22.294788      5.317193    196.691372    -14.949217
std       48.992077     31.615398      9.158482    280.885137     20.333932
min     -128.000000      9.000000    -69.000000     -3.000000   -325.367200
25%        1.000000      9.000000      9.000000     -3.000000     -7.000000
50%        1.000000      9.000000      9.000000     -3.000000     -7.000000
75%        1.000000      9.000000      9.000000    381.000000     -7.000000
max      125.000000    149.000000      9.000000    949.000000     -7.000000]
[25, 97, 62]
In [147]:
# inspect NaN in indiv (train and test same columns of missing data)
[x.isnull().any().sum() for x in indiv_train]
# [x.isnull().any().sum() for x in indiv_test]
Out[147]:
[1, 28, 0]
In [170]:
# DEBUG: reduce indiv categoricals to household level for train A, before left-joining to hhld
# indiv_train[0].head()
# sanity check: value_counts().index[0] picks the mode
# pd.Series(['A', 'A', 'B', 'C', 'C']).value_counts().index[0]
indiv_a_mode = indiv_train[0].loc[:, indiv_train[0].dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0])
indiv_a_mode.head()
# train_a_concat = pd.concat([hhld_train[0], indiv_a_mode])
Out[170]:
HeUgMnzF CaukPfUC MzEtIdUF gtnNTNam SWoXNmPc eXbOkwhI XONDGWjH KsFoQcUV qYRZCuJD FPQrjGnS ... XBldkztv tbgZsPXD qqVibbSA MgCoFhXK rFpoTXAq RXcLsVAQ rQWIpTiG XizJGmbu xqUooaNJ country
id
14 XJsPz mOlYV UFoKR HIvIU onRNG YXCNt ccbZA kpkiH fohru scxJu ... tbsMf yOwsR QQdHS uEstx Hikoa zQvdC xUYIC juMSt JTCKs A
18 XJsPz mOlYV axSTs CXizI onRNG YXCNt ccbZA HgfUG fohru scxJu ... XQevi yOwsR QQdHS uEstx Hikoa zQvdC xUYIC juMSt JTCKs A
36 XJsPz kzSFB axSTs CXizI onRNG YXCNt fOUHD HgfUG fohru HRGCq ... XQevi yOwsR QQdHS gCSRj Hikoa zQvdC rkLqZ juMSt JTCKs A
39 XJsPz mOlYV axSTs CXizI onRNG YXCNt fOUHD HgfUG fohru scxJu ... tbsMf yOwsR QQdHS uEstx Hikoa zQvdC rkLqZ juMSt JTCKs A
58 XJsPz mOlYV axSTs CXizI onRNG YXCNt fOUHD HgfUG fohru scxJu ... tbsMf yOwsR QQdHS uEstx Hikoa zQvdC rkLqZ FUUXv JTCKs A

5 rows × 39 columns

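The same pattern, written out as a minimal sketch on toy data before applying it to all three countries (household 1 has two members, household 2 has one; names and values are illustrative): collapse per-individual rows to one row per household, taking the mode of categorical columns and the mean of numeric ones.
In [ ]:
# sketch only: reduce per-individual rows to one row per household (toy data)
toy = pd.DataFrame({'id': [1, 1, 2], 'cat': ['a', 'a', 'b'], 'num': [2.0, 4.0, 5.0]}).set_index('id')
modes = toy.loc[:, toy.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0])
means = toy.loc[:, toy.dtypes != object].groupby('id').agg('mean')
reduced = pd.concat([modes, means], axis=1)  # one row per household id
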
In [179]:
indiv_train_dropna = [x.dropna(axis=1).drop(['poor', 'country'], axis=1) for x in indiv_train]
indiv_train_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]), x.loc[:, x.dtypes != object].groupby('id').agg('mean')], axis=1) for x in indiv_train_dropna]
In [180]:
indiv_test_dropna = [x.dropna(axis=1).drop(['country'], axis=1) for x in indiv_test]
indiv_test_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]), x.loc[:, x.dtypes != object].groupby('id').agg('mean')], axis=1) for x in indiv_test_dropna]
In [210]:
combined_train = [pd.concat([x.dropna(axis=1), y], axis=1) for x, y in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([x.dropna(axis=1), y], axis=1) for x, y in zip(hhld_test, indiv_test_reduced)]
In [211]:
# indiv_train_reduced[0].head()
# hhld_train[0].head()
# combined_train[0].head()
print([x.isnull().any().sum() for x in hhld_train])
print([x.isnull().any().sum() for x in combined_train])
print([x.isnull().any().sum() for x in hhld_test])
print([x.isnull().any().sum() for x in combined_test])
[0, 9, 0]
[0, 0, 0]
[0, 9, 0]
[0, 0, 0]
In [ ]:
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, [2000, 600, 500])]
In [ ]:
combined_models = [model_train(x, y, np.argmin(z['Logloss_test_avg'])) for x, y, z in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, combined_cvs)]
In [228]:
submission_combined = pd.concat([pred_make(x, y, z) for x, y, z in zip(combined_models, combined_test, countries)], axis=0).set_index('id').reindex(template.index)
submission_combined.to_csv('output/submission_combined_dropna.csv', index=True)
In [223]:
mean_logloss(*combined_cvs)
Out[223]:
0.1713750497289378
In [ ]:
# fillna flow for the combined data
indiv_train_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]), x.loc[:, x.dtypes != object].groupby('id').agg('median')], axis=1) for x in [x.fillna(-99999).drop(['poor', 'country'], axis=1) for x in indiv_train]]
indiv_test_reduced = [pd.concat([x.loc[:, x.dtypes == object].groupby('id').agg(lambda s: s.value_counts().index[0]), x.loc[:, x.dtypes != object].groupby('id').agg('median')], axis=1) for x in [x.fillna(-99999).drop(['country'], axis=1) for x in indiv_test]]
combined_train = [pd.concat([x.fillna(-99999), y], axis=1) for x, y in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([x.fillna(-99999), y], axis=1) for x, y in zip(hhld_test, indiv_test_reduced)]
# reuse the iteration counts selected by the dropna CV runs above
combined_models = [model_train(x, y, np.argmin(z['Logloss_test_avg'])) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, combined_cvs)]
In [ ]:
submission_combined = pd.concat([pred_make(x, y, z) for x, y, z in zip(combined_models, combined_test, countries)], axis=0).set_index('id').reindex(template.index)
submission_combined.to_csv('output/submission_combined_fillna_med.csv', index=True)
In [ ]:
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, [2000, 600, 500])]
In [236]:
mean_logloss(*combined_cvs)
Out[236]:
0.17273918684293196