Africa Soil Challenge

SOC, pH, Ca, P, and Sand are the five target variables to predict.

http://afsiskaggle.qed.ai/

In [11]:
#
# Standard imports including plotting
#
from __future__ import print_function, division
import os
import pandas as pd
import numpy as np
import logging
import multiprocessing
import platform

from joblib import Parallel, delayed  
from pprint import pprint
from collections import OrderedDict

from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline
## %config InlineBackend.figure_format = 'svg'
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)

def underline(astr): return "{}\n{}".format(astr, "-"*len(astr))

N_CPUS = multiprocessing.cpu_count() if platform.system() == 'Linux' else 5

print("Using platform {}; NCPUS {}".format(platform.system(), N_CPUS))
Using platform Darwin; NCPUS 5
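
Submissions are scored by MCRMSE, the mean of the column-wise RMSEs over the five targets. A small helper for local checks (my own sketch, not part of the competition kit; it assumes y_true and y_pred are array-likes of shape (n_samples, 5)):

In [ ]:
def mcrmse(y_true, y_pred):
    """Mean columnwise RMSE: RMSE per target column, averaged across columns."""
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.sqrt(((y_true - y_pred) ** 2).mean(axis=0)).mean()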

In [27]:
#
# SOC, pH, Ca, P, Sand are the five target variables for predictions.
# Same order as sample_submission.csv
# http://afsiskaggle.qed.ai/
#
soil_properties = ["Ca", "P", "pH", "SOC", "Sand"]
training_data = pd.read_csv("training.csv")
test_data = pd.read_csv("sorted_test.csv")

print(underline("Training data info"))
print(training_data.info())
Training data info
------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1157 entries, 0 to 1156
Columns: 3600 entries, PIDN to Sand
dtypes: float64(3598), object(2)
None

In [23]:
#
# Turn training_data into X, y for sklearn
#
from sklearn.preprocessing import StandardScaler

def get_Xy(t_data):
    def dummify(X, column):
        dummies = pd.get_dummies(X[column], prefix=column)
        X[dummies.columns] = dummies
        X.drop(column, axis=1, inplace=True)
 
    X = t_data[[col for col in t_data.columns if col not in soil_properties]].copy()
    X.set_index("PIDN", inplace=True)
    dummify(X, "Depth")
    
    ys = t_data[[col for col in t_data.columns if col in ["PIDN"] + soil_properties]].copy()
    ys.set_index("PIDN", inplace=True)
    return X, ys

X, ys = get_Xy(training_data)
X_test, _ys = get_Xy(test_data)

orig_X, orig_ys = X.copy(), ys.copy()
orig_X_test, _orig_ys = X_test.copy(), _ys.copy()
assert(_ys.empty)
print(underline("X info"))
print(X.info())
X info
------
<class 'pandas.core.frame.DataFrame'>
Index: 1157 entries, XNhoFZW5 to A6IpBbfZ
Columns: 3595 entries, m7497.96 to Depth_Topsoil
dtypes: float64(3595)
None
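
get_Xy is applied to both the training and the test frames, so the dummified feature columns should line up exactly. A quick check of that assumption (it presumes both Depth levels occur in both sets):

In [ ]:
# Models fitted on X are later applied to X_test, so the feature columns must
# match one-for-one (same names, same order).
assert list(X.columns) == list(X_test.columns), \
    set(X.columns).symmetric_difference(set(X_test.columns))
print("train/test feature matrices aligned: {} columns".format(X.shape[1]))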

In [18]:
#
# Investigate P, by far the most difficult soil property to predict
# Maybe map P onto something else? Taking the log helps...
#
import scipy.stats
# there are some repeated values...
map_ranks = zip( sorted(ys["P"]), np.arange(len(ys["P"]))/len(ys["P"]) )
imap_ranks = [(j,i) for i,j in map_ranks]
#ys["Pp"] = [dict(map_ranks)[v] for v in ys["P"]]
(ys['P']/ys['P'].max()).hist(bins=300,figsize=(16,5))
#(ys['Pp']/ys['Pp'].max()).hist(bins=60,figsize=(27,5))
print("P distribution\nmax:{} mean:{} median:{}".format(ys["P"].max(), ys["P"].mean(), ys["P"].median()))
plt.savefig("hist.png")
P distribution
max:13.2668405833 mean:-0.0145235521384 median:-0.269595130469
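
The commented-out rank mapping above is an alternative to the log transform: replace each P value by its empirical quantile. A sketch of that idea using scipy.stats.rankdata (my substitution for the dict-based lookup; ties get averaged ranks), kept for reference only since the log transform is what ends up being used below:

In [ ]:
# Empirical quantile (rank / n) of each P value.
P_quantiles = scipy.stats.rankdata(ys["P"]) / float(len(ys["P"]))
print("P quantiles: min {:.3f}, max {:.3f}".format(P_quantiles.min(), P_quantiles.max()))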

In [24]:
#
# Plot spectra
#
spectra_cols = [col for col in orig_X.columns if col.startswith("m")][::-1]
notspec_cols = [col for col in orig_X.columns if not col.startswith("m")]
spectra_nums = [float(s[1:]) for s in spectra_cols]
assert(spectra_nums == sorted(spectra_nums))
yy, xx = spectra_nums, orig_X[spectra_cols]
plt.figure(figsize=(27,9))
from random import random
for i in range(len(xx))[:10]:
    _c = [random(), random(), random(), .2]
    plt.scatter(yy,xx.ix[i], color=_c, alpha=0.1)
 
#
# Use an SVD here to look at the data structure
#
from sklearn import decomposition

def plot_svd(X):
    svd = decomposition.TruncatedSVD(n_components=3)
    d3 = svd.fit_transform(X)
    plt.figure(figsize=(15,15))
    
    # Show high and low P
    plt.figure(figsize=(7,7))
    d2 = d3[:,(0,1)]
    plt.scatter(*d2.T, color='grey')

    ixs = ys.ix[ys.ix[:,'P']>.5].index
    posns = [ys.index.tolist().index(i) for i in ixs]
    plt.scatter(*d2[posns].T, color='red')

    ixs = ys.ix[ys.ix[:,'P']<-4].index
    posns = [ys.index.tolist().index(i) for i in ixs]
    plt.scatter(*d2[posns].T, color='blue')

    # Show seaborn jointplots
    plt.figure(figsize=(5,5))
    for cols in [(0,1),(0,2),(1,2)]:
        d2 = d3[:,cols]
        assert(d2.T[0].shape == X.index.shape)
        with sns.axes_style("white"):
            sns.jointplot(d2.T[0], d2.T[1], kind="hex")

plot_svd(xx)
[figures: overlaid raw spectra; SVD component scatter with high-P (red) and low-P (blue) samples highlighted; pairwise SVD component jointplots]
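
As a follow-up to the SVD view, the fitted TruncatedSVD exposes explained_variance_ratio_, which shows how much of the spectral variance the leading components carry. A small check (n_components=10 is an arbitrary choice of mine):

In [ ]:
# Cumulative share of spectral variance explained by the first 10 components.
_svd = decomposition.TruncatedSVD(n_components=10)
_svd.fit(xx)
print("cumulative explained variance:",
      ", ".join("{:.3f}".format(v) for v in np.cumsum(_svd.explained_variance_ratio_)))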

Feature engineering

Add or modify features to see whether any of them give an improvement.

In [6]:
def add_cwt(_X):
    """continuous wavelet transform did not help"""
    from scipy import signal
    wavelet = signal.ricker
    widths = np.arange(1,4,3)  # i.e. just [1]; a wider sweep like np.arange(1,8,3) would give 1,4,7
    cols = ["cwt_{}_{}".format(s,w) for w in widths for s in spectra_cols]
    
    cwt_vals = []
    for i in range(len(_X)):
        cwt_vals.append( signal.cwt(_X.ix[i,spectra_cols], wavelet, widths) )
    
    flattened_cwt_vals = np.array([row.flatten() for row in cwt_vals])
    add_X = pd.DataFrame(flattened_cwt_vals, columns=cols, index=_X.index)
    return _X.join(add_X)

def add_peaks(_X):
    """simple peak-finding did not help"""
    cols = ["peak_{}".format(s) for s in spectra_cols[1:-1]]
    flattened_peaks = []
    for i,s in enumerate(spectra_cols):
        if i != 0 and i != len(spectra_cols)-1: 
            flattened_peaks.append(_X.ix[:,s] - (_X.ix[:,spectra_cols[i-1]] + _X.ix[:,spectra_cols[i+1]])/2)
    print(np.array(flattened_peaks).shape, len(cols), len(_X.index))
    add_X = pd.DataFrame(np.array(flattened_peaks).T, columns=cols, index=_X.index)
    return _X.join(add_X)

def add_cross(_X):
    """crossing some of the most predictive features may have helped a bit? 
    turned on for RFE=1000 by accident"""
    X = _X.copy()
    # old_P_cols is no longer used below; kept for reference.
    old_P_cols = ['m619.045', 'm887.105', 'm885.176', 'm617.116', 'm1560.15', 'm1130.09', 
              'm620.973', 'm615.188', 'm613.259', 'm889.033', 'm784.895', 'm782.966', 'm1029.81', 
              'm883.248', 'm1132.02', 'm1031.74', 'm873.605']
    # Cross the non-spectral columns (spatial covariates and the Depth dummies) pairwise.
    P_cols = [c for c in _X.columns if not str(c).startswith("m")]
    cols = [c for c in _X.columns if c in P_cols]
    for i in range(len(cols)):
        for j in range(i,len(cols)):
            X[cols[i]+"_"+cols[j]] = X[cols[i]]*X[cols[j]]
    return X

def log_transform_col(series):
    """log transform helps for P"""
    return np.log(series - np.min(series) + 0.01), np.min(series)

def unlog_transform_col(series, min_series):
    _series = np.exp(series)
    return _series - 0.01 + min_series
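
# Round-trip sanity check (illustrative only): unlog_transform_col should invert
# log_transform_col up to floating point noise. Uses orig_ys from the cell above.
_p_logged, _p_min = log_transform_col(orig_ys["P"])
assert np.allclose(unlog_transform_col(_p_logged, _p_min), orig_ys["P"])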

def add_log(_X):
    """crossing some of the most predictive features may have helped a bit? 
    turned on for RFE=1000 by accident"""
    X = _X.copy()
    cols = [c for c in _X.columns if not c.startswith("m") and "_" not in c]
    for col in cols:
        X["log_"+col], _ = log_transform_col(X[col])
    return X

def sg_row(y, window_size=5, order=2, deriv=0, rate=1):
    """Savitzky Golay did not help"""
    from math import factorial

    try:
        window_size = np.abs(np.int(window_size))
        order = np.abs(np.int(order))
    except ValueError as msg:
        raise ValueError("window_size and order have to be of type int")
    if window_size % 2 != 1 or window_size < 1:
        raise TypeError("window_size size must be a positive odd number")
    if window_size < order + 2:
        raise TypeError("window_size is too small for the polynomials order")
    order_range = range(order+1)
    half_window = (window_size -1) // 2
    # precompute coefficients
    b = np.mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
    m = np.linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv)
    # pad the signal at the extremes with
    # values taken from the signal itself
    firstvals = y[0] - np.abs( y[1:half_window+1][::-1] - y[0] )
    lastvals = y[-1] + np.abs(y[-half_window-1:-1][::-1] - y[-1])
    y = np.concatenate((firstvals, y, lastvals))
    return np.convolve( m[::-1], y, mode='valid')
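
# For reference, sg_row would be applied to a single spectrum like this
# (not wired in below since, as noted, it did not help):
_smoothed = sg_row(orig_X[spectra_cols].ix[0].values, window_size=5, order=2)
assert _smoothed.shape == (len(spectra_cols),)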


# Log transforming P seems to help
ys = orig_ys.copy()
ys.ix[:,'P'], min_P = log_transform_col(orig_ys.ix[:,'P'])

# Adding the crosses for non-spectra columns seems to help
X = add_log(add_cross(orig_X))
X_test = add_log(add_cross(orig_X_test))

# Scale the crosses and the logs
for column in [c for c in X.columns if "_" in c]:
    fit = StandardScaler().fit(X[column])        
    X[column] = fit.transform(X[column])
    X_test[column] = fit.transform(X_test[column])

print(underline("X shape and columns"))
print(X.shape)
print(X.ix[:3,::300])
X shape and columns
-------------------
(1157, 3763)
          m7497.96  m6919.42  m6340.87  m5762.32  m5183.78  m4605.23  \
PIDN                                                                   
XNhoFZW5  0.302553  0.315237  0.301227  0.303091  0.386926  0.395694   
9XNspFTd  0.270192  0.292890  0.275229  0.280140  0.386313  0.411393   
WDId41qG  0.317433  0.310673  0.294377  0.288785  0.346659  0.332620   

          m4026.68  m3448.14  m2869.59  m2291.04  m1712.5  m1133.95  BSAN_EVI  
PIDN                                                                           
XNhoFZW5  0.504770   1.55677  1.011730  0.747903  1.48985   1.19983 -0.547567  
9XNspFTd  0.537780   1.66429  1.050200  0.777105  1.46525   1.36119 -0.547567  
WDId41qG  0.414941   1.36490  0.850053  0.653523  1.42597   1.10501 -1.052489  

Regression

Cross-validate a few regression models on the data and compare their scores.

In [25]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, ElasticNet, RANSACRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn import cross_validation, grid_search
from functools import partial

classifiers = OrderedDict([
('Lasso', (Lasso(), 
    {'Ca':{'alpha':0.0005}, 'P':{'alpha':0.3}, 'pH':{'alpha':0.001}, 'SOC':{'alpha':0.0001}, 'Sand':{'alpha':0.0005}},
    {'alpha':[0.0001,.0005,0.001,.005,.01,.1,.3,.5,.7,1]})),
('Elastic Net', (ElasticNet(), 
    {'Ca':{'alpha':0.001, 'l1_ratio':0.1}, 'P':{'alpha':1, 'l1_ratio':0.0001}, 'SOC':{'alpha':0.0001, 'l1_ratio':0.5},
     'Sand':{'alpha':0.001, 'l1_ratio':0.1}, 'pH':{'alpha':0.01, 'l1_ratio':0.001}},
    {'alpha':[0.0001,.0005,.001,.01,.1,.5,1], 'l1_ratio':[0.0001,.0005,.001,.01,.1,.5,1]})),
('Random Forest', (RandomForestRegressor(), 
    {'Ca':{'n_estimators':20}, 'P':{'n_estimators':50}, 'pH':{'n_estimators':20}, 'SOC':{'n_estimators':10},
     'Sand':{'n_estimators':20}},
    {'n_estimators':[1,2,4,7,10,20]})),
("SVR", (SVR(),
    {'Ca':{'kernel':'linear', 'C':0.9, 'epsilon':0.1, 'shrinking':True}, 
     'P':{'kernel':'linear', 'C':1, 'epsilon':0.15, 'shrinking':True}, 
     'SOC':{'kernel':'linear', 'C':1, 'epsilon':0.2, 'shrinking':False}, 
     'Sand':{'kernel':'linear', 'C':0.3, 'epsilon':0.01, 'shrinking':False}, 
     'pH':{'kernel':'linear', 'C':0.1, 'epsilon':0.5, 'shrinking':True}},
    {'kernel':['linear'], 'C':[0.1,.3,.5,.9,1,10,100,1000], 'epsilon':[.01,.05,.1,.2,.5], 'shrinking':[True,False]})),
("KNN", (KNeighborsRegressor(),
    {'Ca':{'n_neighbors':1}, 'P':{'n_neighbors':1}, 'SOC':{'n_neighbors':1}, 'pH':{'n_neighbors':1},'Sand':{'n_neighbors':1}},
    {'n_neighbors':[1,2,4,6,10,20,40]})),
('RANSAC', (RANSACRegressor(),
    {'Ca':{'min_samples':100}, 'P':{'min_samples':100},'pH':{'min_samples':100},
     'SOC':{'min_samples':100},'Sand':{'min_samples':100}},
    {'max_trials':[100,200]})),
])


def do_cv(pred, classifiers, do_gridsearch=False, num_cv=5, n_jobs=5):
    print("{} CV RMSE:".format(pred))

    y = ys[pred]
    # Note: StratifiedKFold is intended for class labels; for these continuous
    # targets a plain shuffled KFold would be the more usual choice.
    kfolder = cross_validation.StratifiedKFold(y, shuffle=True, n_folds=num_cv)
    mean_scores = {}
    gridsearch_params = {cname:{} for cname in classifiers}
    rfe_features = X.columns
    
    if do_gridsearch:
        for cname, (classifier, params, all_grid_params) in classifiers.items():
            clf = grid_search.GridSearchCV(classifier, all_grid_params, n_jobs=n_jobs, cv=kfolder, scoring='mean_squared_error')
            result = clf.fit(X, y)
            print("{} best_params: {}".format(cname, result.best_params_))
            gridsearch_params[cname][pred] = result.best_params_
    
    for cname, (classifier, params, all_grid_params) in classifiers.items():
        use_params = gridsearch_params[cname][pred] if pred in gridsearch_params[cname] else params[pred]
        classifier.set_params(**use_params)
        HACK = -1 # https://github.com/scikit-learn/scikit-learn/issues/2439
        scores = HACK * cross_validation.cross_val_score(classifier, X[rfe_features], y, n_jobs=n_jobs, cv=kfolder, scoring='mean_squared_error')
        print("\t{}: {:.4f} ({})".format(cname.ljust(14), scores.mean(), ','.join("{:.4f}".format(s) for s in scores)))
        mean_scores[cname] = scores.mean()
    return mean_scores, gridsearch_params, rfe_features

#
# Learning parameters, cross-validation
#
def gridsearch(ypreds, classifiers, do_gridsearch=False, do_rfe=True):
    all_scores = {}
    rfe_features = {}
    for pred in ypreds:
        all_scores[pred], gridsearch_params, rfe_features[pred] = \
            do_cv(pred, classifiers, num_cv=5, do_gridsearch=do_gridsearch, n_jobs=N_CPUS)
    return all_scores, gridsearch_params, rfe_features

def print_all_scores(all_scores, classifiers):
    print("\n"+"All scores".center(22,"-"))
    for classifier in classifiers:
        smean = 0
        for pred in all_scores:
            smean += all_scores[pred][classifier]
        print("{}: {:.4f}".format(classifier.ljust(14), smean / len(all_scores)))

# Only trying a couple of the available classifiers at a time...
# Elastic Net sometimes has trouble converging
classifiers = {c: classifiers[c] for c in ['SVR', 'KNN']}  # also available: 'Random Forest', 'Elastic Net', 'Lasso', 'RANSAC'
all_scores, gridsearch_params, rfe_features = gridsearch(soil_properties, classifiers, do_gridsearch=False)
print_all_scores(all_scores, classifiers)
Ca CV MSE:
	KNN           : 0.1848 (0.1494,0.1768,0.0886,0.3276,0.1815)
	SVR           : 0.1667 (0.1255,0.2382,0.0439,0.2731,0.1529)
P CV MSE:
	KNN           : 0.9644 (0.3028,0.7544,0.8154,1.6661,1.2835)
	SVR           : 0.9598 (0.4334,0.5857,0.5892,1.9257,1.2650)
pH CV MSE:
	KNN           : 0.3204 (0.3392,0.3021,0.3067,0.2930,0.3611)
	SVR           : 0.1882 (0.1595,0.2018,0.1848,0.1639,0.2309)
SOC CV MSE:
	KNN           : 0.3424 (0.3376,0.4245,0.3090,0.2753,0.3656)
	SVR           : 0.1434 (0.1461,0.1577,0.1432,0.1218,0.1479)
Sand CV MSE:
	KNN           : 0.2194 (0.3324,0.1854,0.2372,0.2095,0.1325)
	SVR           : 0.1432 (0.1527,0.1480,0.1756,0.1344,0.1052)

------All scores------
KNN           : 0.4063
SVR           : 0.3202
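
The per-fold numbers above are mean squared errors. For a rough comparison with the leaderboard's MCRMSE, take square roots of the per-target means (a crude estimate of mine: the square root of a mean MSE is not exactly the mean of per-fold RMSEs, and P is being scored here on its log-transformed scale):

In [ ]:
# Rough MCRMSE estimate for SVR from the mean CV MSEs computed above.
svr_rmse = {pred: np.sqrt(all_scores[pred]['SVR']) for pred in all_scores}
pprint(svr_rmse)
print("approx MCRMSE: {:.4f}".format(np.mean(list(svr_rmse.values()))))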

Submitting to Kaggle

Here I apply the winning regression and format the submission.

In [26]:
#
# Submitting...
#

def create_sub_pred(cname, pred):
    classifier, params, grid_params = classifiers[cname]
    classifier.set_params(**params[pred])    
    y_pred = classifier.fit(X[rfe_features[pred]], ys[pred]).predict(X_test[rfe_features[pred]])
    
    if cname == "SVR":
        print("\nThe top 10 most informative factors for {}, according to Linear SVM\n".format(pred))
        print(sorted(zip(X.columns, classifier.coef_[0]), key=lambda x: -abs(x[1]))[:10])
    
    return y_pred

def create_sub_df():
    y_pred = OrderedDict()
    cnamedict = {"Ca":"SVR", "P":"SVR", "pH":"SVR", "SOC":"SVR", "Sand":"SVR"}
    for pred in soil_properties:
        y_pred[pred] = create_sub_pred(cnamedict[pred], pred)
        if pred == 'P':
            y_pred[pred] = unlog_transform_col(y_pred[pred], min_P)
    return pd.DataFrame(y_pred, index=X_test.index)

if set(rfe_features.keys()) == set(soil_properties):
    # Only if I have a model for all soil_properties..
    df = create_sub_df()
    open("sub.csv",'w').write(df.to_csv())
    print("Submission created!")
else:
    logging.warn("Missing information (only {} available). Not creating submission.".format(rfe_features.keys()))

The top 10 most informative factors for Ca, according to Linear SVM

[('m1795.42', -0.3920469364107968), ('m1793.49', -0.39115819703121812), ('m1797.35', -0.36600032719369668), ('BSAS', 0.35667456211375914), ('m1791.57', -0.32334899883718415), ('m1799.28', -0.32049552738594345), ('m2508.96', -0.2785886384558075), ('m2510.89', -0.27625191018048778), ('m2507.04', -0.27248981977650544), ('m2512.82', -0.26714522972483978)]

The top 10 most informative factors for P, according to Linear SVM

[('BSAN', 0.68849191151352251), ('BSAV', -0.30326830375195568), ('m3646.77', -0.25082329588314045), ('m3644.84', -0.24013451232434635), ('m1619.93', -0.21729672246646148), ('m1621.86', -0.21534719661382207), ('m971.958', 0.21103632910955983), ('m782.966', 0.21082814553615581), ('m1618', -0.21073860330838667), ('m1623.79', -0.20723967072400629)]

The top 10 most informative factors for pH, according to Linear SVM

[('ELEV', 0.16276673200764505), ('LSTN', 0.15471189560638421), ('TMAP', 0.13006711707785221), ('REF7', -0.098146400498278094), ('BSAV', 0.091703730430462704), ('BSAS', 0.09048401797854147), ('m813.822', 0.086046647652885), ('LSTD', -0.082663027880160139), ('REF2', 0.082411553604997231), ('m815.751', 0.082206163180870012)]

The top 10 most informative factors for SOC, according to Linear SVM

[('BSAV', 0.73567457857865259), ('m698.113', 0.53498287994672067), ('m1793.49', -0.50104992542188009), ('m1795.42', -0.49402961672908985), ('m700.041', 0.49270644773259309), ('m715.469', -0.46358283526150013), ('m696.184', 0.45678035867434441), ('m1797.35', -0.44569291690368895), ('m1791.57', -0.44137624751400351), ('m1687.43', -0.4375649974878828)]

The top 10 most informative factors for Sand, according to Linear SVM

[('REF7', -0.58704928103636655), ('BSAV', 0.50726210356080714), ('m813.822', -0.36712184201904807), ('m811.894', -0.33720714674629948), ('m1872.56', 0.29884982891257533), ('TMAP', 0.29857480289475741), ('m804.18', 0.29835488171568703), ('m1870.63', 0.29810297429791477), ('m1874.49', 0.29652027761838334), ('m1868.71', 0.29258586565132616)]
Submission created!
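
Before uploading, it is worth checking that sub.csv has the layout Kaggle expects: a PIDN column followed by the five properties in the same order as sample_submission.csv, with one row per test sample. A quick check (assumes sample_submission.csv is in the working directory):

In [ ]:
# Compare the written submission against the sample submission's layout.
sample = pd.read_csv("sample_submission.csv")
sub = pd.read_csv("sub.csv")
assert list(sub.columns) == list(sample.columns), (list(sub.columns), list(sample.columns))
assert len(sub) == len(sample)
print("sub.csv: {} rows, columns {}".format(len(sub), list(sub.columns)))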
