# Linear regression: predicting the price of wine
# Load the training data and inspect the column types
wine <- read.csv(file = "wine.csv")
str(wine)
## 'data.frame': 25 obs. of 7 variables:
## $ Year : int 1952 1953 1955 1957 1958 1959 1960 1961 1962 1963 ...
## $ Price : num 7.5 8.04 7.69 6.98 6.78 ...
## $ WinterRain : int 600 690 502 420 582 485 763 830 697 608 ...
## $ AGST : num 17.1 16.7 17.1 16.1 16.4 ...
## $ HarvestRain: int 160 80 130 110 187 187 290 38 52 155 ...
## $ Age : int 31 30 28 26 25 24 23 22 21 20 ...
## $ FrancePop : num 43184 43495 44218 45152 45654 ...
# Per-column summary statistics (min, quartiles, mean, max) of the training data
summary(wine)
## Year Price WinterRain AGST HarvestRain
## Min. :1952 Min. :6.21 Min. :376 Min. :15.0 Min. : 38
## 1st Qu.:1960 1st Qu.:6.52 1st Qu.:536 1st Qu.:16.2 1st Qu.: 89
## Median :1966 Median :7.12 Median :600 Median :16.5 Median :130
## Mean :1966 Mean :7.07 Mean :605 Mean :16.5 Mean :149
## 3rd Qu.:1972 3rd Qu.:7.50 3rd Qu.:697 3rd Qu.:17.1 3rd Qu.:187
## Max. :1978 Max. :8.49 Max. :830 Max. :17.6 Max. :292
## Age FrancePop
## Min. : 5.0 Min. :43184
## 1st Qu.:11.0 1st Qu.:46584
## Median :17.0 Median :50255
## Mean :17.2 Mean :49694
## 3rd Qu.:23.0 3rd Qu.:52894
## Max. :31.0 Max. :54602
# Examine pairwise correlations between all columns (every column is numeric,
# so the whole data frame can be passed to cor())
cor(wine)
## Year Price WinterRain AGST HarvestRain Age
## Year 1.00000 -0.4478 0.016970 -0.2469 0.02801 -1.00000
## Price -0.44777 1.0000 0.136651 0.6596 -0.56332 0.44777
## WinterRain 0.01697 0.1367 1.000000 -0.3211 -0.27544 -0.01697
## AGST -0.24692 0.6596 -0.321091 1.0000 -0.06450 0.24692
## HarvestRain 0.02801 -0.5633 -0.275441 -0.0645 1.00000 -0.02801
## Age -1.00000 0.4478 -0.016970 0.2469 -0.02801 1.00000
## FrancePop 0.99449 -0.4669 -0.001622 -0.2592 0.04126 -0.99449
## FrancePop
## Year 0.994485
## Price -0.466862
## WinterRain -0.001622
## AGST -0.259162
## HarvestRain 0.041264
## Age -0.994485
## FrancePop 1.000000
# Same correlation matrix, rounded to two decimals so it fits on one screen
round(cor(wine), digits = 2)
## Year Price WinterRain AGST HarvestRain Age FrancePop
## Year 1.00 -0.45 0.02 -0.25 0.03 -1.00 0.99
## Price -0.45 1.00 0.14 0.66 -0.56 0.45 -0.47
## WinterRain 0.02 0.14 1.00 -0.32 -0.28 -0.02 0.00
## AGST -0.25 0.66 -0.32 1.00 -0.06 0.25 -0.26
## HarvestRain 0.03 -0.56 -0.28 -0.06 1.00 -0.03 0.04
## Age -1.00 0.45 -0.02 0.25 -0.03 1.00 -0.99
## FrancePop 0.99 -0.47 0.00 -0.26 0.04 -0.99 1.00
# Move the response (Price) to the first column so that its correlations with
# the predictors form the first row/column of the matrix
wineReordered <- wine[, c(
  "Price", "Year", "WinterRain", "AGST", "HarvestRain", "Age", "FrancePop"
)]
round(cor(wineReordered), digits = 2)
## Price Year WinterRain AGST HarvestRain Age FrancePop
## Price 1.00 -0.45 0.14 0.66 -0.56 0.45 -0.47
## Year -0.45 1.00 0.02 -0.25 0.03 -1.00 0.99
## WinterRain 0.14 0.02 1.00 -0.32 -0.28 -0.02 0.00
## AGST 0.66 -0.25 -0.32 1.00 -0.06 0.25 -0.26
## HarvestRain -0.56 0.03 -0.28 -0.06 1.00 -0.03 0.04
## Age 0.45 -1.00 -0.02 0.25 -0.03 1.00 -0.99
## FrancePop -0.47 0.99 0.00 -0.26 0.04 -0.99 1.00
# Scatterplot matrix of every pair of columns
pairs(wine)
## Correlogram example
# Install corrgram only when it is missing, and name the CRAN mirror
# explicitly: a bare install.packages("corrgram") aborts with "trying to use
# CRAN without setting a mirror" when the session has no repository configured,
# and it would needlessly reinstall the package on every run of the script.
if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram", repos = "https://cloud.r-project.org")
}
library(corrgram)
## Loading required package: seriation
# Correlogram of the Price-first data frame: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal
corrgram(wineReordered, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Predict price of wine")
# Linear regression with a single predictor: Price explained by AGST
model1 <- lm(formula = Price ~ AGST, data = wine)
summary(model1)
##
## Call:
## lm(formula = Price ~ AGST, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7845 -0.2388 -0.0373 0.3899 0.9032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.418 2.494 -1.37 0.18371
## AGST 0.635 0.151 4.21 0.00034 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.499 on 23 degrees of freedom
## Multiple R-squared: 0.435, Adjusted R-squared: 0.41
## F-statistic: 17.7 on 1 and 23 DF, p-value: 0.000335
# Sum of squared errors: start by listing the per-observation residuals
residuals(model1)
## 1 2 3 4 5 6 7 8
## 0.04204 0.82984 0.21169 0.15609 -0.23119 0.38992 -0.48959 0.90318
## 9 10 11 12 13 14 15 16
## 0.45372 0.14887 -0.23882 -0.08974 0.66186 -0.05212 -0.62727 -0.74715
## 17 18 19 20 21 22 23 24
## 0.42114 -0.03727 0.10685 -0.78450 -0.64018 -0.05509 -0.67055 -0.22040
## 25
## 0.55867
# SSE of model1 = sum of the squared residuals
SSE <- sum(residuals(model1)^2)
SSE
## [1] 5.735
# Linear regression with two predictors: AGST and HarvestRain
model2 <- lm(formula = Price ~ AGST + HarvestRain, data = wine)
summary(model2)
##
## Call:
## lm(formula = Price ~ AGST + HarvestRain, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8832 -0.1960 0.0618 0.1538 0.5972
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.20265 1.85443 -1.19 0.24759
## AGST 0.60262 0.11128 5.42 1.9e-05 ***
## HarvestRain -0.00457 0.00101 -4.52 0.00017 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.367 on 22 degrees of freedom
## Multiple R-squared: 0.707, Adjusted R-squared: 0.681
## F-statistic: 26.6 on 2 and 22 DF, p-value: 1.35e-06
# Sum of squared errors for model2 (overwrites the SSE value of model1)
SSE <- sum(residuals(model2)^2)
SSE
## [1] 2.97
# Linear regression on every predictor except Year, which is left out of the
# formula (its correlation with Age is -1.00 in the matrix above)
model3 <- lm(
  formula = Price ~ AGST + HarvestRain + WinterRain + Age + FrancePop,
  data = wine
)
summary(model3)
##
## Call:
## lm(formula = Price ~ AGST + HarvestRain + WinterRain + Age +
## FrancePop, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4818 -0.2466 -0.0073 0.2201 0.5199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.50e-01 1.02e+01 -0.04 0.96520
## AGST 6.01e-01 1.03e-01 5.84 1.3e-05 ***
## HarvestRain -3.96e-03 8.75e-04 -4.52 0.00023 ***
## WinterRain 1.04e-03 5.31e-04 1.96 0.06442 .
## Age 5.85e-04 7.90e-02 0.01 0.99417
## FrancePop -4.95e-05 1.67e-04 -0.30 0.76958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.302 on 19 degrees of freedom
## Multiple R-squared: 0.829, Adjusted R-squared: 0.784
## F-statistic: 18.5 on 5 and 19 DF, p-value: 1.04e-06
# Sum of squared errors for model3 (overwrites the SSE value of model2)
SSE <- sum(residuals(model3)^2)
SSE
## [1] 1.732
######
# Drop FrancePop: it is not significant in model3 and is almost perfectly
# correlated with Age (-0.99)
model4 <- lm(
  formula = Price ~ AGST + HarvestRain + WinterRain + Age,
  data = wine
)
summary(model4)
##
## Call:
## lm(formula = Price ~ AGST + HarvestRain + WinterRain + Age, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4547 -0.2427 0.0075 0.1977 0.5364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.429980 1.765898 -1.94 0.06631 .
## AGST 0.607209 0.098702 6.15 5.2e-06 ***
## HarvestRain -0.003972 0.000854 -4.65 0.00015 ***
## WinterRain 0.001076 0.000507 2.12 0.04669 *
## Age 0.023931 0.008097 2.96 0.00782 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.295 on 20 degrees of freedom
## Multiple R-squared: 0.829, Adjusted R-squared: 0.794
## F-statistic: 24.2 on 4 and 20 DF, p-value: 2.04e-07
# Correlations: a weak predictor/response pair, then a nearly collinear
# predictor pair, then the full matrix again for reference
with(wine, cor(WinterRain, Price))
## [1] 0.1367
with(wine, cor(Age, FrancePop))
## [1] -0.9945
cor(wine)
## Year Price WinterRain AGST HarvestRain Age
## Year 1.00000 -0.4478 0.016970 -0.2469 0.02801 -1.00000
## Price -0.44777 1.0000 0.136651 0.6596 -0.56332 0.44777
## WinterRain 0.01697 0.1367 1.000000 -0.3211 -0.27544 -0.01697
## AGST -0.24692 0.6596 -0.321091 1.0000 -0.06450 0.24692
## HarvestRain 0.02801 -0.5633 -0.275441 -0.0645 1.00000 -0.02801
## Age -1.00000 0.4478 -0.016970 0.2469 -0.02801 1.00000
## FrancePop 0.99449 -0.4669 -0.001622 -0.2592 0.04126 -0.99449
## FrancePop
## Year 0.994485
## Price -0.466862
## WinterRain -0.001622
## AGST -0.259162
## HarvestRain 0.041264
## Age -0.994485
## FrancePop 1.000000
# Drop both Age and FrancePop to see what removing the correlated pair
# entirely does to the fit
model5 <- lm(
  formula = Price ~ AGST + HarvestRain + WinterRain,
  data = wine
)
summary(model5)
##
## Call:
## lm(formula = Price ~ AGST + HarvestRain + WinterRain, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6747 -0.1296 0.0197 0.2075 0.6385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.301626 2.036674 -2.11 0.04683 *
## AGST 0.681024 0.111701 6.10 4.7e-06 ***
## HarvestRain -0.003948 0.000999 -3.95 0.00073 ***
## WinterRain 0.001177 0.000592 1.99 0.06010 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.345 on 21 degrees of freedom
## Multiple R-squared: 0.754, Adjusted R-squared: 0.719
## F-statistic: 21.4 on 3 and 21 DF, p-value: 1.36e-06
# Evaluate the model on held-out data
# Load the test set and inspect its structure
wineTest <- read.csv(file = "wine_test.csv")
str(wineTest)
## 'data.frame': 2 obs. of 7 variables:
## $ Year : int 1979 1980
## $ Price : num 6.95 6.5
## $ WinterRain : int 717 578
## $ AGST : num 16.2 16
## $ HarvestRain: int 122 74
## $ Age : int 4 3
## $ FrancePop : num 54836 55110
# Predict test-set prices with model4 (AGST, HarvestRain, WinterRain, Age)
predictTest <- predict(object = model4, newdata = wineTest)
predictTest
## 1 2
## 6.769 6.685
# Out-of-sample R-squared: prediction error on the test set relative to a
# baseline that always predicts the mean price of the training set
SSE <- sum((wineTest$Price - predictTest)^2)
SST <- sum((wineTest$Price - mean(wine$Price))^2)
1 - SSE / SST
## [1] 0.7944