This is a chunk of starter code to allow you to begin developing your scripts for the Kaggle Tracy Regression competition.

Kaggle usually expects that the training and test files are in an input directory, so that is how this is configured.

I am using GitHub, so my directories live in a GitHub repository tree. First, we load ggplot2 and establish the working directory.

library(ggplot2)

# Point R at the local checkout of the competition project. This path is
# specific to the author's machine; adjust it to wherever the repository lives.
setwd("~/GitHub/DataScienceCurriculum/DataScienceCurriculum/DiamondRegression")

# Load the competition data from the input/ directory: the training set (with
# the price column), the test set (without it), and the submission template.
train <- read.csv("input/train.csv")
test <- read.csv("input/test.csv")
sample_submission <- read.csv("input/sample_submission.csv")

# Preview the first six rows of each data frame. The lines starting with "##"
# are the console output recorded when this script was run.

# Training data: ID, the diamond attributes, and the price target.
head(train)
##   ID carat       cut color clarity depth table price    x    y    z
## 1  1  0.72   Premium     F     SI1  61.6    60  2314 5.79 5.73 3.55
## 2  2  0.51     Ideal     F    VVS1  62.0    57  2812 5.15 5.11 3.18
## 3  3  0.50 Very Good     F     SI1  63.1    61  1106 5.06 4.98 3.17
## 4  4  1.51   Premium     G     SI1  62.2    58 10497 7.26 7.30 4.53
## 5  5  1.08     Ideal     J     SI2  61.8    57  3537 6.56 6.58 4.06
## 6  6  0.31     Ideal     G     VS2  59.1    57   544 4.45 4.48 2.64
# Test data: same columns as the training data except that price is absent.
head(test)
##      ID carat       cut color clarity depth table    x    y    z
## 1 37759  0.22      Fair     E     VS2  65.1    61 3.87 3.78 2.49
## 2 37760  0.23 Very Good     H     VS1  59.4    61 4.00 4.05 2.39
## 3 37761  0.30      Good     J     SI1  64.0    55 4.25 4.28 2.73
## 4 37762  0.22   Premium     F     SI1  60.4    61 3.88 3.84 2.33
## 5 37763  0.31     Ideal     J     SI2  62.2    54 4.35 4.37 2.71
## 6 37764  0.20   Premium     E     SI2  60.2    62 3.79 3.75 2.27
# Submission template: one row per test ID with a placeholder price.
head(sample_submission)
##      ID price
## 1 37759  1000
## 2 37760  1000
## 3 37761  1000
## 4 37762  1000
## 5 37763  1000
## 6 37764  1000

For a sample submission, just fit a simple linear regression of log(1 + price) on carat.

# Benchmark model: ordinary least squares of log1p(price) on carat, fit on the
# training data.
lmod <- lm(log1p(price) ~ carat, data = train)

# Predictions come back on the log1p scale, so convert them back to prices.
# expm1() is the exact inverse of log1p() and avoids the rounding loss of
# exp(x) - 1 when x is close to zero.
train.predict <- expm1(predict(lmod))
test.predict <- expm1(predict(lmod, newdata = test))

# Start from the sample submission so the ID column and layout are kept, then
# overwrite the placeholder prices with the model's predictions.
# NOTE(review): assumes the rows of test are in the same order as the rows of
# sample_submission -- confirm before submitting.
benchmark_submission <- sample_submission
benchmark_submission$price <- test.predict

# Spell out FALSE: the shorthand F is an ordinary variable and can be
# reassigned, which would silently add a row-name column to the CSV.
write.csv(benchmark_submission, "benchmark_submission.csv", row.names = FALSE)

We can compute the RMSE of the model on the training data, measured on the log scale, with this code:

# Training-set RMSE on the log1p(price) scale, built up step by step from the
# residuals stored on the fitted model.

# Residual sum of squares. crossprod() returns a 1x1 matrix, so flatten it to
# a plain number.
RSS <- as.numeric(crossprod(lmod$residuals))

# Mean squared error: the sum of squares averaged over the residual count.
MSE <- RSS / length(lmod$residuals)

# Root mean squared error.
RMSE <- sqrt(MSE)
RMSE
## [1] 0.3979422