# Load the training inputs into a data frame and list its column names.
train <- read.csv("reg_train_in.csv")
colnames(train)
## [1] "Point_ID" "Input_1" "Input_2" "Input_3" "Input_4" "Input_5"
## [7] "Input_6" "Input_7" "Input_8" "Input_9" "Input_10" "Input_11"
## [13] "Input_12" "Input_13" "Input_14"
Plots of the label (Point_ID) vs. each feature
library(plyr)
library(ggplot2)
# Build one scatter plot per column of `train` other than Point_ID:
# the column goes on the x-axis, Point_ID on the y-axis, and points are
# drawn semi-transparent (alpha = 0.3).
plotlist <- llply(
  setdiff(names(train), "Point_ID"),
  function(feature) {
    ggplot(train, aes_string(x = feature, y = "Point_ID")) +
      geom_point(alpha = 0.3)
  }
)

# multiplot() comes from the sourced multiplot.R (not shown here);
# presumably it arranges the plots on a grid with 4 columns -- confirm there.
source("multiplot.R")
multiplot(plotlist = plotlist, cols = 4)
Baseline OLS model
# Ordinary least squares fit of Point_ID on every other column of `train`,
# followed by the coefficient table and fit statistics.
linModel <- lm(Point_ID ~ ., data = train)
summary(linModel)
##
## Call:
## lm(formula = Point_ID ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -548.81 -123.87 -10.65 122.04 481.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.725e+02 1.911e+00 194.899 < 2e-16 ***
## Input_1 4.256e+00 4.324e-04 9843.434 < 2e-16 ***
## Input_2 1.743e+02 1.050e+01 16.600 < 2e-16 ***
## Input_3 -2.309e+01 5.657e+00 -4.082 4.48e-05 ***
## Input_4 3.603e+02 1.887e+01 19.097 < 2e-16 ***
## Input_5 -5.930e+01 8.036e+00 -7.379 1.63e-13 ***
## Input_6 -5.672e+02 1.596e+01 -35.539 < 2e-16 ***
## Input_7 -7.801e-01 9.202e-01 -0.848 0.397
## Input_8 1.173e+02 8.881e+00 13.203 < 2e-16 ***
## Input_9 -6.706e+01 9.759e+00 -6.872 6.45e-12 ***
## Input_10 2.105e+02 8.587e+00 24.520 < 2e-16 ***
## Input_11 -2.833e-01 9.146e-01 -0.310 0.757
## Input_12 -1.402e+02 7.821e+00 -17.932 < 2e-16 ***
## Input_13 -1.355e+02 1.061e+01 -12.772 < 2e-16 ***
## Input_14 1.306e+02 9.845e+00 13.271 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 168.5 on 33735 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 8.061e+06 on 14 and 33735 DF, p-value: < 2.2e-16
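It looks like Input_1 is the only feature that is strongly linearly correlated with Point_ID. Features [2,3,4,5,6,8,9,10,12,13,14] are statistically significant but contribute far less, possibly providing some quantization information, while Input_7 and Input_11 are not significant. Refitting with Input_1 alone: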
# Refit using Input_1 as the only predictor of Point_ID and print the summary.
# Note: this overwrites the `linModel` object from the full fit.
linModel <- lm(Point_ID ~ Input_1, data = train)
summary(linModel)
##
## Call:
## lm(formula = Point_ID ~ Input_1, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -389.41 -125.77 -15.76 122.98 417.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.904e+02 1.864e+00 209.5 <2e-16 ***
## Input_1 4.251e+00 4.138e-04 10274.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 174.2 on 33748 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 1.056e+08 on 1 and 33748 DF, p-value: < 2.2e-16