
Some simple experiments in linear regression using scipy, Stan, and PyMC3.

Take some data from a Google spreadsheet containing a response variable (y) and one or more predictors (x1, x2, ...). I assume the first column is the response variable and apply linear regression several different ways.

from __future__ import print_function, division

import requests
import numpy as np
import seaborn as sns
import pandas as pd
import pymc3 as pm
import pystan
import matplotlib as mpl
import matplotlib.pyplot as plt

from StringIO import StringIO
from pprint import pprint
from IPython.display import display, HTML, SVG
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm, probplot

def uprint(astr): print(astr + "\n" + "-"*len(astr))
def show_html(astr): return display(HTML('<center>{}</center>'.format(astr)))

def show_svg(astr, w=1000, h=1000):
    SVG_HEAD = '''<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">'''
    SVG_START = '''<svg width="{w:}px" height="{h:}px" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink= "http://www.w3.org/1999/xlink">'''
    return display(SVG(SVG_HEAD + SVG_START.format(w=w, h=h) + astr + '</svg>'))

# Plotting style
plt.rc("axes", titlesize=20, labelsize=15, linewidth=.25, edgecolor='#444444')
sns.set_context("notebook", font_scale=1.2, rc={})
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Data

Here we load the data into a DataFrame and look at a summary.

# Read in data from google spreadsheet
gsheet_key = "1q2xrN2YQ4KrGedaupDX2aM9dWGSmz23v7UF2GxwlO0A"
gsheet_url = "https://docs.google.com/spreadsheets/d/1q2xrN2YQ4KrGedaupDX2aM9dWGSmz23v7UF2GxwlO0A/edit?usp=sharing"

csv = requests.get("https://docs.google.com/spreadsheets/d/{}/export?gid=0&format=csv".format(gsheet_key))
df = pd.read_csv(StringIO(csv.text))

show_html("<h1>Data summary</h1>")
show_html(df.describe(percentiles=[]).to_html())

TEST_LOG = False
if TEST_LOG:
    df["x3"] = np.log(df["x3"])

Data summary

           y         x1         x2         x3
count    100        100        100        100
mean    0.016842  -0.052540  -0.027275   0.011383
std     0.092049   0.969025   0.959984   0.375131
min    -0.204356  -2.257561  -2.447623  -0.859929
50%     0.012005   0.019029  -0.072433   0.040495
max     0.252106   2.304593   2.329134   0.934868

Standardizing data

By standardizing the data (subtracting the mean and dividing by the standard deviation), we avoid some common problems: when variables are on very different scales, coefficients are hard to compare and some fitting routines behave poorly.

See http://andrewgelman.com/2009/07/11/when_to_standar/ for a discussion of standardization.
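For reference, this is all the scaling does, column by column; a minimal sketch (StandardScaler uses the population standard deviation, i.e. ddof=0):

# Hand-rolled equivalent of the StandardScaler loop below (sketch only; df is not modified)
df_by_hand = (df - df.mean()) / df.std(ddof=0)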

# Standardize each column in place, keeping each column's inverse transform for later
df_fits = {}
for col in df.columns:
    _df_fit = StandardScaler().fit(df[col])
    df[col] = _df_fit.transform(df[col])
    df_fits[col] = _df_fit.inverse_transform
show_html("<h1>Data overview, all-vs-all</h1>")
g = sns.PairGrid(df)
#g.fig.suptitle('Data overview, all-vs-all', y=1.05, fontsize=28)
g.map_upper(sns.regplot)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_diag(plt.hist)

Data overview, all-vs-all


Are the data normal?

Linear regression tends to behave better when the data are approximately normal.

It's not uncommon for some data types to be closer to log-normal than normal, i.e., the data only look normal after taking the log; adult blood pressure, for example, is approximately log-normal. Where that is the case, we will use the logged version of the variable.
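As a quick illustration (with simulated data, not the spreadsheet), a log-normal sample fits the normal probability plot much better after taking the log:

# Illustration with simulated data: a log-normal sample looks normal after log-transform
_rng = np.random.RandomState(0)
_x = np.exp(_rng.normal(0, 1, 1000))      # log-normal sample
_r_raw = probplot(_x)[1][2]               # correlation coefficient vs. normal quantiles
_r_log = probplot(np.log(_x))[1][2]
print("R^2 raw: {:.3f}, R^2 log: {:.3f}".format(_r_raw**2, _r_log**2))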

fig, ax = plt.subplots(figsize=(16,12))
#fig.suptitle('Compare fits for data vs logged data ', y=1.05, fontsize=28)
show_html("<h1>Compare fits for data vs logged data</h1>")
plt.subplots_adjust(hspace=1)
eps = 1e-4

for i, var in enumerate(df.columns):
    ax1 = plt.subplot(len(df.columns), 2, i*2+1)
    ax2 = plt.subplot(len(df.columns), 2, i*2+2)

    #
    # Original data (no log applied)
    #
    (_osm, _osr), (_slope, _intercept, _R) = probplot(df[var])
    ax1.scatter(_osm, _osr, color='steelblue')
    ax1.plot([_osm.min(), _osm.max()], [_intercept + _slope*_osm.min(), _intercept + _slope*_osm.max()], color='seagreen')
    ax1.set(title="{}  probability plot (no log) $R^2:{:.3f}$".format(var, _R**2))

    #
    # Data with log applied
    #
    log_X_var = np.log(eps - df[var].min() + df[var])
    (_osm, _osr), (_slope, _intercept, _Rlog) = probplot(log_X_var)

    ax2.scatter(_osm, _osr, color='steelblue')
    ax2.plot([_osm.min(), _osm.max()], [_intercept + _slope*_osm.min(), _intercept + _slope*_osm.max()], color='seagreen')
    ax2.set(title="{}  probability plot (log) $R^2:{:.3f}$".format(var, _Rlog**2))

    #
    # Decide whether to use original version or the logged version
    #
    if _Rlog > _R:
        # Update the stored inverse transform so it undoes the log as well.
        # Bind the current function and offset as default arguments: a plain closure
        # would see the values from the *last* loop iteration (and looking up
        # df_fits[var] inside the lambda would recurse).
        _tmpfn1, _tmpval = df_fits[var], df[var].min()
        df_fits[var] = lambda x, _fn=_tmpfn1, _mn=_tmpval: _fn(np.exp(x) + _mn - eps)
        df[var] = log_X_var
        annotate_ax = ax2
    else:
        annotate_ax = ax1

    annotate_ax.annotate("Higher $R^2$", xy=(0.7, 0.15), xycoords='axes fraction',
                         color='indianred', fontsize=24)

plt.tight_layout()

Compare fits for data vs logged data


Linear Regression

Apply ordinary least-squares linear regression.

X = df[df.columns[1:]].values
y = df[df.columns[0]].values

ols = LinearRegression(fit_intercept=True, normalize=False, n_jobs=-1)
ols.fit(X, y)

show_html('''
<h1>OLS linear regression results</h1>
<table>
<tr><td>Intercept</td><td>{intercept:.3f}</td></tr>
<tr><td>Coefficients</td><td>{coef:s}</td></tr>
<tr><td>Mean squared error</td><td>{mse:.3f}</td></tr>
<tr><td>$R^2$ of the prediction.</td><td>{score:.3f}</td></tr>
</table>'''.format(intercept = ols.intercept_,
            coef = ', '.join("{:s}:{:.3f}".format(df.columns[1:][i], ols.coef_[i]) for i in range(len(ols.coef_))),
            mse = np.mean((ols.predict(X) - y) ** 2),  # mean squared residual
            score = ols.score(X, y))
)

OLS linear regression results

Intercept               0.000
Coefficients            x1:0.218, x2:0.163, x3:0.126
Mean squared error      0.911
R² of the prediction    0.089
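As a quick sanity check (not part of the original pipeline), the same fit can be recovered from the closed-form least-squares solution; a minimal sketch:

# Sanity check (sketch): least squares on a design matrix with an intercept column;
# the solution should match sklearn's intercept and coefficients above.
X_design = np.column_stack((np.ones(X.shape[0]), X))
beta_hat = np.linalg.lstsq(X_design, y)[0]
print("intercept and coefficients:", beta_hat)

Bayesian Regression (Stan)

Next, fit the same linear model in Stan. The coefficients are left with Stan's default flat priors, and the error scale sigma gets a half-normal prior (the normal(0,1) statement combined with the lower bound on its declaration), following the Gelman recommendation referenced in the comment below.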
# http://www.stat.columbia.edu/~gelman/presentations/stantalk2014_handout.pdf
# Instead of a gamma use a half-normal, as Gelman recommends (see stan manual "Truncated Priors")
stan_regress = """
data {
  int N;
  int K;
  vector[N] y;
  matrix[N,K] X;
}
parameters {
  real intercept;
  vector[K] betas;
  real<lower=0.01> sigma;
}
model {
  sigma ~ normal(0,1);
  y ~ normal(intercept + X*betas, sigma);
}"""

stan_data = {'X': X, 'y': y, 'N': X.shape[0], 'K': X.shape[1]}

# Fit the model
fit = pystan.stan(model_code=stan_regress, data=stan_data, iter=1000, chains=4)

print(fit)
Inference for Stan model: anon_model_fa066f9745cd16eb2f6cf993e14d51c9.
4 chains, each with iter=1000; warmup=500; thin=1;
post-warmup draws per chain=500, total post-warmup draws=2000.

            mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
intercept 4.0e-3  4.0e-3    0.1  -0.18  -0.06 5.8e-3   0.07   0.19  564.0    1.0
betas[0]    0.22  4.2e-3    0.1 7.6e-3   0.15   0.22   0.29   0.41  587.0    1.0
betas[1]    0.17  4.0e-3    0.1  -0.03    0.1   0.17   0.23   0.37  615.0    1.0
betas[2]    0.13  4.1e-3    0.1  -0.07   0.06   0.13    0.2   0.32  634.0    1.0
sigma       0.98  2.9e-3   0.07   0.85   0.94   0.98   1.03   1.13  562.0    1.0
lp__      -48.43    0.08   1.58 -52.34 -49.28 -48.16 -47.24 -46.26  427.0    1.0

Samples were drawn using NUTS(diag_e) at Mon May  4 11:15:39 2015.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
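To work with the posterior draws directly rather than the printed summary, they can be pulled out of the fit object; a minimal sketch using PyStan's extract method:

# Sketch: extract posterior draws as arrays (PyStan 2 API)
samples = fit.extract(permuted=True)
print("posterior mean of betas:", samples['betas'].mean(axis=0))
print("posterior mean of sigma:", samples['sigma'].mean())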

Bayesian Regression Models (MCMC)

Bayesian regression is more flexible than OLS: we can choose priors that best represent the data. In the function below, a flat prior on the coefficients recovers OLS, a Normal prior corresponds to ridge (L2) regularization, and a Laplace prior corresponds to the lasso (L1).

burnin, num_samples = 50, 200
np.random.seed(1)

# Add a column for the intercept (x_0)
X_1 = np.column_stack( (np.ones(X.shape[0]),  X) )
beta_names = ["intercept"] + list(df.columns[1:]) #['beta_{}'.format(i) for i in range(X_1.shape[1])]

def regress(regression="ridge", error="normal", beta_names=beta_names, nu=5):
    with pm.Model() as model:
        #
        # Priors
        #
        if regression == "ols":
            betas = [pm.Uniform(beta, -10, 10) for beta in beta_names] # >10 is really slow...
        elif regression == "ridge":
            betas = [pm.Normal(beta, mu=0, tau=1.0) for beta in beta_names]
        elif regression == "lasso":
            betas = [pm.Laplace(beta, mu=0, b=1.0) for beta in beta_names]
        else:
            raise ValueError, "No regression of type {}".format(regression)

        # prior on Normal/T error variance. Gamma(1,1) has mean=1, variance=1
        # HalfCauchy is better...
        tau = pm.HalfCauchy('tau', 1, testval=1.0)

        #
        # Likelihood
        #
        # mu ~ beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3
        mu = sum(betas[n]*X_1[:,n] for n in range(len(betas)))

        if error == "normal":
            y_obs = pm.Normal('y_obs', mu=mu, tau=tau, observed=y)
        elif error == "T":
            y_obs = pm.T('y_obs', nu=nu, mu=mu, lam=tau, observed=y)
        else:
            raise ValueError, "No error model of type {}".format(error)

        #
        # Sample
        #
        start = pm.find_MAP()
        print("starting trace at MAP:", start)
        # Nudge every start value slightly away from zero (avoids some corner cases)
        for k in start:
            start[k] += .01
        step = pm.NUTS(state=start, scaling=start)
        trace = pm.sample(num_samples, step, start, progressbar=True, njobs=1)[burnin:]
        #pm.traceplot(trace)
    return trace

trace = {}
show_html("<h1>Ordinary Least Squares</h1>")
trace['ols'] = regress(regression="ols", error="normal")
print(trace['ols'])
#pm.traceplot(trace['ols'])

Ordinary Least Squares

starting trace at MAP: {'x2': array(0.16259050929728255), 'x3': array(0.12564858828139785), 'x1': array(0.21839587006704464), 'intercept': array(-1.779530779589083e-08), 'tau': array(1.0740140634338355)}
 [-----------------100%-----------------] 200 of 200 complete in 0.5 sec
<MultiTrace: 1 chains, 150 iterations, 5 variables>
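The returned trace can also be queried directly; for example, posterior means of each variable (a minimal sketch using the same MultiTrace indexing as the model-comparison code further down):

# Sketch: posterior means straight from the OLS trace
for name in trace['ols'].varnames:
    print(name, trace['ols'][name].mean())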
show_html("<h1>Ridge Regression</h1>")
trace['ridge/T'] = regress(regression="ridge", error="T")
pm.traceplot(trace['ridge/T'])

Ridge Regression

starting trace at MAP: {'x2': array(0.1778392943710258), 'x3': array(0.08305421591573277), 'x1': array(0.22367363162561957), 'intercept': array(-0.00609829689854981), 'tau': array(1.3774522554543267)}
 [-----------------100%-----------------] 200 of 200 complete in 0.6 sec
show_html("<h1>LASSO</h1>")
trace['lasso/T'] = regress(regression="lasso", error="T")
pm.traceplot(trace['lasso/T'])

LASSO

starting trace at MAP: {'x2': array(0.17055991944072316), 'x3': array(0.07282629907667493), 'x1': array(0.21454246999666238), 'intercept': array(-0.0002746999774107022), 'tau': array(1.3787986641723053)}
 [-----------------100%-----------------] 200 of 200 complete in 0.6 sec
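Before building the custom SVG comparison below, a quick text overview of each trace can be handy; a minimal sketch, assuming pm.summary is available in this version of PyMC3:

# Optional quick look at each trace (assumes pm.summary exists in this PyMC3 version)
for name in trace:
    print(name)
    pm.summary(trace[name])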
#--------------------------------------------
# Compare PyMC models
#
# Collect posterior quantiles and summary stats for each variable in each trace.
# (pm.stats._calculate_* are private pymc3 helpers; each yields the variable name,
# then a dict of values.)
stats, quartiles = {}, {}
for regression in trace:
    stats[regression], quartiles[regression] = {}, {}

    for var in trace[regression].varnames:
        post_quar = pm.stats._calculate_posterior_quantiles(trace[regression][var], qlist=[5, 25, 50, 75, 95])
        _key, quartiles[regression][var] = next(post_quar), next(post_quar)

        post_stats = pm.stats._calculate_stats(trace[regression][var], batches=100, alpha=0.05)
        _key, stats[regression][var] = next(post_stats), next(post_stats)

# Graph all models
min_beta = min(min(quartiles[regression][var].values()) for regression in trace for var in beta_names)
max_beta = max(max(quartiles[regression][var].values()) for regression in trace for var in beta_names)

rh = 20
rh2 = 20*1.5*len(beta_names)
svgw = 1000
svgh = rh2 * len(beta_names)
def _norm(val): return svgw * (val - min_beta) / (max_beta - min_beta)
def _normw(val): return svgw * val / (max_beta - min_beta)

ssvg = ''
colors = ["#DD5C5C ", "#54CD7F", "#54BFFF", "#44dd99"]
for i, beta in enumerate(beta_names):
    for j, regression in enumerate(trace):
        color = colors[j%len(colors)]

        s, q = stats[regression][beta], quartiles[regression][beta]
        w = q['q75'] - q['q25']
        xj = q['q25']
        yj = rh2*i + rh*j
        m, lo, hi = s['mean'], q['lo'], q['hi']

        ssvg += '<line x1="{x1:f}" y1="{y1:f}" x2="{x2:f}" y2="{y2:f}" stroke="#777777" stroke-width=".5"/>'.format(
            x1=_norm(lo), y1=yj+rh/2, x2=_norm(hi), y2=yj+rh/2)
        ssvg += '<rect x="{x:f}" y="{y:f}" width="{w:f}" height="{h:f}" fill="{color:s}" />'.format(
            x=_norm(xj), w=_normw(w), y=yj, h=rh, color=color)
        ssvg += '<rect x="{x:f}" y="{y:f}" width="2px" height="{h:f}" fill="#ffffff" />'.format(
            x=_norm(m), w=2, y=yj, h=rh)
        ssvg += '<text x="{x:f}" y="{y:f}" text-anchor="middle" dy="0.3em">{text}</text>'.format(
            x=_norm(xj + w/2), y=yj+rh/2, text=beta)
        ssvg += '<text x="{x:f}" y="{y:f}" text-anchor="end" dy="0.3em" fill="#ffffff" font-size="10">{text}</text>'.format(
            x=_norm(xj + w) - 3, y=yj+rh/2, text=regression)

show_html("<h1>Analysis of the difference between different regressions</h1>")
show_svg(ssvg, w=svgw, h=svgh)

Analysis of the difference between different regressions
