SOC, pH, Ca, P, and Sand are the five target soil properties to predict.
#
# Standard imports including plotting
#
from __future__ import print_function, division
import os
import pandas as pd
import numpy as np
import logging
import multiprocessing
import platform
from joblib import Parallel, delayed
from pprint import pprint
from collections import OrderedDict
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline
## %config InlineBackend.figure_format = 'svg'
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
def underline(astr): return "{}\n{}".format(astr, "-"*len(astr))
N_CPUS = multiprocessing.cpu_count() if platform.system() == 'Linux' else 5
print("Using platform {}; NCPUS {}".format(platform.system(), N_CPUS))
#
# SOC, pH, Ca, P, Sand are the five target variables for predictions.
# Same order as sample_submission.csv
# http://afsiskaggle.qed.ai/
#
soil_properties = ["Ca", "P", "pH", "SOC", "Sand"]
training_data = pd.read_csv("training.csv")
test_data = pd.read_csv("sorted_test.csv")
print(underline("Training data info"))
print(training_data.info())
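#
# As a quick sketch of what we are working with, peek at the handful of
# non-spectral columns (everything else is the m* infrared spectra).
#
_nonspec = [c for c in training_data.columns if not c.startswith("m")]
print(_nonspec)
print(training_data[_nonspec].head(3))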
#
# Turn training_data into X, y for sklearn
#
from sklearn.preprocessing import StandardScaler
def get_Xy(t_data):
def dummify(X, column):
dummies = pd.get_dummies(X[column], prefix=column)
X[dummies.columns] = dummies
X.drop(column, axis=1, inplace=True)
    # filter on training_data.columns so train and test keep the same feature columns
    X = t_data[[col for col in training_data.columns if col not in soil_properties]].copy()
X.set_index("PIDN", inplace=True)
dummify(X, "Depth")
ys = t_data[[col for col in t_data.columns if col in ["PIDN"] + soil_properties]]
ys.set_index("PIDN", inplace=True)
return X, ys
X, ys = get_Xy(training_data)
X_test, _ys = get_Xy(test_data)
orig_X, orig_ys = X.copy(), ys.copy()
orig_X_test, _orig_ys = X_test.copy(), _ys.copy()
assert(_ys.empty)  # the test frame carries no target columns
print(underline("X info"))
print(X.info())
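#
# Sanity-check sketch: X and ys should share their PIDN index, and X_test
# should end up with exactly the same feature columns as X.
#
print(X.shape, ys.shape, X_test.shape)
assert list(X.columns) == list(X_test.columns)
assert (X.index == ys.index).all()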
#
# Investigate P, by far the most difficult soil property to predict
# Maybe map P onto something else? Taking the log helps...
#
import scipy.stats
# an alternative rank (empirical CDF) mapping of P, kept but not used; note P has repeated values
map_ranks = zip( sorted(ys["P"]), np.arange(len(ys["P"]))/len(ys["P"]) )
imap_ranks = [(j,i) for i,j in map_ranks]
#ys["Pp"] = [dict(map_ranks)[v] for v in ys["P"]]
(ys['P']/ys['P'].max()).hist(bins=300,figsize=(16,5))
#(ys['Pp']/ys['Pp'].max()).hist(bins=60,figsize=(27,5))
print("P distribution\nmax:{} mean:{} median:{}".format(ys["P"].max(), ys["P"].mean(), ys["P"].median()))
plt.savefig("hist.png")
#
# Plot spectra
#
spectra_cols = [col for col in orig_X.columns if col.startswith("m")][::-1]
notspec_cols = [col for col in orig_X.columns if not col.startswith("m")]
spectra_nums = [float(s[1:]) for s in spectra_cols]
assert(spectra_nums == sorted(spectra_nums))
yy, xx = spectra_nums, orig_X[spectra_cols]
plt.figure(figsize=(27,9))
from random import random
# plot only the first 10 spectra, each in a random translucent colour
for i in range(len(xx))[:10]:
_c = [random(), random(), random(), .2]
plt.scatter(yy,xx.ix[i], color=_c, alpha=0.1)
#
# Use an SVD here to look at the data structure
#
from sklearn import decomposition
def plot_svd(X):
svd = decomposition.TruncatedSVD(n_components=3)
d3 = svd.fit_transform(X)
    # Show high and low P in the space of the first two components
    plt.figure(figsize=(7,7))
d2 = d3[:,(0,1)]
plt.scatter(*d2.T, color='grey')
ixs = ys.ix[ys.ix[:,'P']>.5].index
posns = [ys.index.tolist().index(i) for i in ixs]
plt.scatter(*d2[posns].T, color='red')
ixs = ys.ix[ys.ix[:,'P']<-4].index
posns = [ys.index.tolist().index(i) for i in ixs]
plt.scatter(*d2[posns].T, color='blue')
    # Show seaborn jointplots for each pair of the first three components
for n, cols in enumerate([(0,1),(0,2),(1,2)]):
d2 = d3[:,cols]
assert(d2.T[0].shape == X.index.shape)
with sns.axes_style("white"):
sns.jointplot(d2.T[0], d2.T[1], kind="hex");
plot_svd(xx)
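#
# As a rough check of how much of the spectral variance the leading components
# capture, fit a slightly larger TruncatedSVD on the same matrix and print the
# explained variance ratios.
#
_svd = decomposition.TruncatedSVD(n_components=10)
_svd.fit(xx)
print("explained variance ratio of the first 10 components:")
print(np.round(_svd.explained_variance_ratio_, 4))
print("cumulative: {:.4f}".format(_svd.explained_variance_ratio_.sum()))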
Add or modify features to see whether any of them improve the predictions.
def add_cwt(_X):
"""continuous wavelet transform did not help"""
from scipy import signal
wavelet = signal.ricker
    widths = np.arange(1,4,3)  # this yields only [1]; use np.arange(1,8,3) for widths 1, 4, 7
cols = ["cwt_{}_{}".format(s,w) for w in widths for s in spectra_cols]
cwt_vals = []
for i in range(len(_X)):
cwt_vals.append( signal.cwt(_X.ix[i,spectra_cols], wavelet, widths) )
flattened_cwt_vals = np.array([row.flatten() for row in cwt_vals])
add_X = pd.DataFrame(flattened_cwt_vals, columns=cols, index=_X.index)
return _X.join(add_X)
def add_peaks(_X):
"""simple peak-finding did not help"""
cols = ["peak_{}".format(s) for s in spectra_cols[1:-1]]
flattened_peaks = []
for i,s in enumerate(spectra_cols):
if i != 0 and i != len(spectra_cols)-1:
flattened_peaks.append(_X.ix[:,s] - (_X.ix[:,spectra_cols[i-1]] + _X.ix[:,spectra_cols[i+1]])/2)
    print(np.array(flattened_peaks).shape, len(cols), len(_X.index))  # shape sanity check
add_X = pd.DataFrame(np.array(flattened_peaks).T, columns=cols, index=_X.index)
return _X.join(add_X)
def add_cross(_X):
    """Cross (multiply) pairs of the non-spectra features; may have helped a bit
    (it was left on by accident for the RFE=1000 run)."""
    X = _X.copy()
    # spectra bands once hand-picked for P; kept for reference, unused below
    old_P_cols = ['m619.045', 'm887.105', 'm885.176', 'm617.116', 'm1560.15', 'm1130.09',
                  'm620.973', 'm615.188', 'm613.259', 'm889.033', 'm784.895', 'm782.966', 'm1029.81',
                  'm883.248', 'm1132.02', 'm1031.74', 'm873.605']
    # cross only the non-spectra columns
    cols = [c for c in _X.columns if not str(c).startswith("m")]
for i in range(len(cols)):
for j in range(i,len(cols)):
X[cols[i]+"_"+cols[j]] = X[cols[i]]*X[cols[j]]
return X
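#
# Tiny illustration of add_cross on a toy frame (hypothetical values): every
# pair of non-"m" columns, including each column with itself, gets a product
# column named "a_b".
#
_toy = pd.DataFrame({"ELEV": [1.0, 2.0], "TMAP": [3.0, 4.0], "m619.045": [0.1, 0.2]})
print(add_cross(_toy).columns.tolist())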
def log_transform_col(series):
"""log transform helps for P"""
return np.log(series - np.min(series) + 0.01), np.min(series)
def unlog_transform_col(series, min_series):
_series = np.exp(series)
return _series - 0.01 + min_series
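#
# Quick sanity check that unlog_transform_col really inverts log_transform_col:
# the round trip on the raw P column should give essentially zero error.
#
_logged_P, _min_P = log_transform_col(orig_ys["P"])
_roundtrip_P = unlog_transform_col(_logged_P, _min_P)
print("round-trip max abs error:", np.max(np.abs(_roundtrip_P - orig_ys["P"])))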
def add_log(_X):
"""crossing some of the most predictive features may have helped a bit?
turned on for RFE=1000 by accident"""
X = _X.copy()
cols = [c for c in _X.columns if not c.startswith("m") and "_" not in c]
for col in cols:
X["log_"+col], _ = log_transform_col(X[col])
return X
def sg_row(y, window_size=5, order=2, deriv=0, rate=1):
"""Savitzky Golay did not help"""
from math import factorial
try:
window_size = np.abs(np.int(window_size))
order = np.abs(np.int(order))
    except ValueError:
raise ValueError("window_size and order have to be of type int")
if window_size % 2 != 1 or window_size < 1:
raise TypeError("window_size size must be a positive odd number")
if window_size < order + 2:
raise TypeError("window_size is too small for the polynomials order")
order_range = range(order+1)
half_window = (window_size -1) // 2
# precompute coefficients
b = np.mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
m = np.linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv)
# pad the signal at the extremes with
# values taken from the signal itself
firstvals = y[0] - np.abs( y[1:half_window+1][::-1] - y[0] )
lastvals = y[-1] + np.abs(y[-half_window-1:-1][::-1] - y[-1])
y = np.concatenate((firstvals, y, lastvals))
return np.convolve( m[::-1], y, mode='valid')
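#
# Example usage of sg_row (kept for reference even though it did not help):
# smooth the first training spectrum and compare the first few values.
#
_raw_spectrum = orig_X.ix[0, spectra_cols].values.astype(float)
_smoothed = sg_row(_raw_spectrum, window_size=5, order=2)
print(_smoothed.shape, _raw_spectrum[:3], _smoothed[:3])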
# Log transforming P seems to help
ys = orig_ys.copy()
ys.ix[:,'P'], min_P = log_transform_col(orig_ys.ix[:,'P'])
# Adding the crosses for non-spectra columns seems to help
X = add_log(add_cross(orig_X))
X_test = add_log(add_cross(orig_X_test))
# Scale the crosses and the logs
for column in [c for c in X.columns if "_" in c]:
fit = StandardScaler().fit(X[column])
X[column] = fit.transform(X[column])
X_test[column] = fit.transform(X_test[column])
print(underline("X shape and columns"))
print(X.shape)
print(X.ix[:3,::300])
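#
# Quick check that the derived columns (crosses, logs, and the Depth dummies,
# all of which contain an underscore) were standardised as intended: their
# means should be close to 0 and their standard deviations close to 1.
#
_derived = [c for c in X.columns if "_" in c]
print("derived columns:", len(_derived))
print("mean of means: {:.4f}, mean of stds: {:.4f}".format(
    X[_derived].mean().mean(), X[_derived].std().mean()))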
Run regressions on the data, cross-validating several estimators for each soil property.
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, ElasticNet, RANSACRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn import cross_validation, grid_search
from functools import partial
classifiers = OrderedDict([
('Lasso', (Lasso(),
{'Ca':{'alpha':0.0005}, 'P':{'alpha':0.3}, 'pH':{'alpha':0.001}, 'SOC':{'alpha':0.0001}, 'Sand':{'alpha':0.0005}},
{'alpha':[0.0001,.0005,0.001,.005,.01,.1,.3,.5,.7,1]})),
('Elastic Net', (ElasticNet(),
{'Ca':{'alpha':0.001, 'l1_ratio':0.1}, 'P':{'alpha':1, 'l1_ratio':0.0001}, 'SOC':{'alpha':0.0001, 'l1_ratio':0.5},
'Sand':{'alpha':0.001, 'l1_ratio':0.1}, 'pH':{'alpha':0.01, 'l1_ratio':0.001}},
{'alpha':[0.0001,.0005,.001,.01,.1,.5,1], 'l1_ratio':[0.0001,.0005,.001,.01,.1,.5,1]})),
('Random Forest', (RandomForestRegressor(),
{'Ca':{'n_estimators':20}, 'P':{'n_estimators':50}, 'pH':{'n_estimators':20}, 'SOC':{'n_estimators':10},
'Sand':{'n_estimators':20}},
{'n_estimators':[1,2,4,7,10,20]})),
("SVR", (SVR(),
{'Ca':{'kernel':'linear', 'C':0.9, 'epsilon':0.1, 'shrinking':True},
'P':{'kernel':'linear', 'C':1, 'epsilon':0.15, 'shrinking':True},
'SOC':{'kernel':'linear', 'C':1, 'epsilon':0.2, 'shrinking':False},
'Sand':{'kernel':'linear', 'C':0.3, 'epsilon':0.01, 'shrinking':False},
'pH':{'kernel':'linear', 'C':0.1, 'epsilon':0.5, 'shrinking':True}},
{'kernel':['linear'], 'C':[0.1,.3,.5,.9,1,10,100,1000], 'epsilon':[.01,.05,.1,.2,.5], 'shrinking':[True,False]})),
("KNN", (KNeighborsRegressor(),
{'Ca':{'n_neighbors':1}, 'P':{'n_neighbors':1}, 'SOC':{'n_neighbors':1}, 'pH':{'n_neighbors':1},'Sand':{'n_neighbors':1}},
{'n_neighbors':[1,2,4,6,10,20,40]})),
('RANSAC', (RANSACRegressor(),
{'Ca':{'min_samples':100}, 'P':{'min_samples':100},'pH':{'min_samples':100},
'SOC':{'min_samples':100},'Sand':{'min_samples':100}},
{'max_trials':[100,200]})),
])
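#
# Each entry of `classifiers` is an (estimator, per-property fixed params,
# grid-search params) triple. For example, this is what the SVR entry holds for P:
#
_svr, _svr_params, _svr_grid = classifiers["SVR"]
print("fixed SVR params for P:", _svr_params["P"])
print("grid searched over:", sorted(_svr_grid.keys()))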
def do_cv(pred, classifiers, do_gridsearch=False, num_cv=5, n_jobs=5):
print("{} CV RMSE:".format(pred))
y = ys[pred]
kfolder = cross_validation.StratifiedKFold(y, shuffle=True, n_folds=num_cv)
mean_scores = {}
gridsearch_params = {cname:{} for cname in classifiers}
rfe_features = X.columns
if do_gridsearch:
for cname, (classifier, params, all_grid_params) in classifiers.items():
clf = grid_search.GridSearchCV(classifier, all_grid_params, n_jobs=n_jobs, cv=kfolder, scoring='mean_squared_error')
result = clf.fit(X, y)
print("{} best_params: {}".format(cname, result.best_params_))
gridsearch_params[cname][pred] = result.best_params_
for cname, (classifier, params, all_grid_params) in classifiers.items():
use_params = gridsearch_params[cname][pred] if pred in gridsearch_params[cname] else params[pred]
classifier.set_params(**use_params)
HACK = -1 # https://github.com/scikit-learn/scikit-learn/issues/2439
scores = HACK * cross_validation.cross_val_score(classifier, X[rfe_features], y, n_jobs=n_jobs, cv=kfolder, scoring='mean_squared_error')
print("\t{}: {:.4f} ({})".format(cname.ljust(14), scores.mean(), ','.join("{:.4f}".format(s) for s in scores)))
mean_scores[cname] = scores.mean()
return mean_scores, gridsearch_params, rfe_features
#
# Learning parameters, cross-validation
#
def gridsearch(ypreds, classifiers, do_gridsearch=False, do_rfe=True):
    # do_rfe is currently unused; do_cv simply returns all of X's columns as rfe_features
    all_scores = {}
    rfe_features = {}
for pred in ypreds:
all_scores[pred], gridsearch_params, rfe_features[pred] = \
do_cv(pred, classifiers, num_cv=5, do_gridsearch=do_gridsearch, n_jobs=N_CPUS)
return all_scores, gridsearch_params, rfe_features
def print_all_scores(all_scores, classifiers):
print("\n"+"All scores".center(22,"-"))
for classifier in classifiers:
smean = 0
for pred in all_scores:
smean += all_scores[pred][classifier]
print("{}: {:.4f}".format(classifier.ljust(14), smean / len(all_scores)))
# Only trying a couple of the available classifiers at a time...
# Elastic Net sometimes has trouble converging
# classifiers = {c:classifiers[c] for c in ['SVR','KNN','Random Forest','Elastic Net','Lasso']}
classifiers = {c:classifiers[c] for c in ['SVR','KNN']}
all_scores, gridsearch_params, rfe_features = gridsearch(soil_properties, classifiers, do_gridsearch=False)
print_all_scores(all_scores, classifiers)
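#
# To re-learn the per-property hyperparameters instead of using the hard-coded
# ones above, flip the do_gridsearch flag (much slower):
#
# all_scores, gridsearch_params, rfe_features = gridsearch(
#     soil_properties, classifiers, do_gridsearch=True)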
Here I apply the winning regressions (a linear SVR for each soil property) and format the submission.
#
# Submitting...
#
def create_sub_pred(cname, pred):
classifier, params, grid_params = classifiers[cname]
classifier.set_params(**params[pred])
y_pred = classifier.fit(X[rfe_features[pred]], ys[pred]).predict(X_test[rfe_features[pred]])
if cname == "SVR":
print("\nThe top 10 most informative factors for {}, according to Linear SVM\n".format(pred))
print(sorted(zip(X.columns, classifier.coef_[0]), key=lambda x: -abs(x[1]))[:10])
return y_pred
def create_sub_df():
y_pred = OrderedDict()
cnamedict = {"Ca":"SVR", "P":"SVR", "pH":"SVR", "SOC":"SVR", "Sand":"SVR"}
for pred in soil_properties:
y_pred[pred] = create_sub_pred(cnamedict[pred], pred)
if pred == 'P':
y_pred[pred] = unlog_transform_col(y_pred[pred], min_P)
return pd.DataFrame(y_pred, index=X_test.index)
if set(rfe_features.keys()) == set(soil_properties):
    # Only create a submission if there is a model for every soil property
    df = create_sub_df()
    df.to_csv("sub.csv")
    print("Submission created!")
else:
    logging.warning("Missing information (only {} available). Not creating submission.".format(list(rfe_features.keys())))