import graphlab
import numpy as np
sales = graphlab.SFrame('kc_house_data.gl/')
#Some new features
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
# In the dataset, 'floors' was defined with type string,
# so we'll convert them to float, before creating a new feature.
sales['floors'] = sales['floors'].astype(float)
sales['floors_square'] = sales['floors']*sales['floors']
[INFO] graphlab.cython.cy_server: GraphLab Create v1.10.1 started. Logging: C:\Users\RUNERI~1\AppData\Local\Temp\graphlab_server_1466348297.log.0
INFO:graphlab.cython.cy_server:GraphLab Create v1.10.1 started. Logging: C:\Users\RUNERI~1\AppData\Local\Temp\graphlab_server_1466348297.log.0
This trial license of GraphLab Create is assigned to applerised@gmail.com and will expire on July 18, 2016. Please contact trial@dato.com for licensing options or to request a free non-commercial license for personal or academic use.
Let us fit a model with all the features available, plus the features we just created above.
all_features = ['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt',
'floors', 'floors_square','waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built',
'yr_renovated']
model_all = graphlab.linear_regression.create(sales, target='price', features=all_features, validation_set=None, l2_penalty=0.,
l1_penalty=1e10, verbose=False)
model_all.coefficients.print_rows(18)
+------------------+-------+---------------+--------+
| name | index | value | stderr |
+------------------+-------+---------------+--------+
| (intercept) | None | 274873.05595 | None |
| bedrooms | None | 0.0 | None |
| bedrooms_square | None | 0.0 | None |
| bathrooms | None | 8468.53108691 | None |
| sqft_living | None | 24.4207209824 | None |
| sqft_living_sqrt | None | 350.060553386 | None |
| sqft_lot | None | 0.0 | None |
| sqft_lot_sqrt | None | 0.0 | None |
| floors | None | 0.0 | None |
| floors_square | None | 0.0 | None |
| waterfront | None | 0.0 | None |
| view | None | 0.0 | None |
| condition | None | 0.0 | None |
| grade | None | 842.068034898 | None |
| sqft_above | None | 20.0247224171 | None |
| sqft_basement | None | 0.0 | None |
| yr_built | None | 0.0 | None |
| yr_renovated | None | 0.0 | None |
+------------------+-------+---------------+--------+
[18 rows x 4 columns]
To find a good L1 penalty, we will explore multiple values using a validation set. Let us do three way split into train, validation, and test sets:
#Training, Validation, Testing
(training_and_validation, testing) = sales.random_split(.9,seed=1)
(training, validation) = training_and_validation.random_split(0.5, seed=1)
Create models with different lambda values, searching for lambda with lowest RSS
minRss = 1e20
bestPenalty = 0
for l1_penalty in np.logspace(1, 7, num=13):
model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None,
l2_penalty=0., l1_penalty=l1_penalty, verbose=False)
predicted = model.predict(validation)
rss = ((predicted - validation['price'])**2).sum()
if minRss > rss:
minRss = rss
bestPenalty = l1_penalty
bestModel = model
print "The best model has", (bestModel['coefficients']['value'].nnz()), "non zero coefficients, its RSS on validation is",\
minRss, "for a lambda of", bestPenalty
The best model has 18 non zero coefficients, its RSS on validation is 6.25766285142e+14 for a lambda of 10.0
Calculate the RSS of this model on the test data.
model_all = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None,
l2_penalty=0., l1_penalty=10, verbose=False)
predicted = model_all.predict(testing)
rss = ((predicted - testing['price'])**2).sum()
print rss
1.56983602382e+14
What if we wanted to limit ourselves to, say, 7 features? This may be important if we want an interpretable model that has only a few features in them.
Let’s define a wide range of possible l1_penalty_values. Remember the smallest lambda with less than 7 nonzeros, and the biggest lambda with more than 7 nonzeros.
prefNonzeros = 7
l1_penalty_values = np.logspace(8, 10, num=20)
flag = True
for l1_penalty in np.logspace(8, 10, num=20):
model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None,
l2_penalty=0., l1_penalty=l1_penalty, verbose=False)
nonZeros = model['coefficients']['value'].nnz()
if nonZeros > prefNonzeros:
l1_penalty_min = l1_penalty
if (nonZeros < prefNonzeros and flag):
l1_penalty_max = l1_penalty
flag = False
print 'Minimum lambda', l1_penalty_min
print 'Maximum lambda', l1_penalty_max
Minimum lambda 2976351441.63
Maximum lambda 3792690190.73
We will now explore the narrow region of l1_penalty values we found:
l1_penalty_values = np.linspace(l1_penalty_min,l1_penalty_max,20)
minRss = None
bestPenalty = 0
for l1_penalty in l1_penalty_values:
model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None,
l2_penalty=0., l1_penalty=l1_penalty, verbose=False)
predicted = model.predict(validation)
rss = ((predicted - validation['price'])**2).sum()
if model['coefficients']['value'].nnz() == prefNonzeros:
if minRss is None or minRss > rss:
minRss = rss
bestPenalty = l1_penalty
bestModel = model
print "The best model has", (bestModel['coefficients']['value'].nnz()), "non zero coefficients, its RSS is on validation", \
minRss, "for a lambda of", bestPenalty
bestModel.coefficients.print_rows(18)
The best model has 7 non zero coefficients, its RSS is on validation 1.04693748875e+15 for a lambda of 3448968612.16
+------------------+-------+---------------+--------+
| name | index | value | stderr |
+------------------+-------+---------------+--------+
| (intercept) | None | 222253.192544 | None |
| bedrooms | None | 661.722717782 | None |
| bedrooms_square | None | 0.0 | None |
| bathrooms | None | 15873.9572593 | None |
| sqft_living | None | 32.4102214513 | None |
| sqft_living_sqrt | None | 690.114773313 | None |
| sqft_lot | None | 0.0 | None |
| sqft_lot_sqrt | None | 0.0 | None |
| floors | None | 0.0 | None |
| floors_square | None | 0.0 | None |
| waterfront | None | 0.0 | None |
| view | None | 0.0 | None |
| condition | None | 0.0 | None |
| grade | None | 2899.42026975 | None |
| sqft_above | None | 30.0115753022 | None |
| sqft_basement | None | 0.0 | None |
| yr_built | None | 0.0 | None |
| yr_renovated | None | 0.0 | None |
+------------------+-------+---------------+--------+
[18 rows x 4 columns]