# Linear Regression predicting price of wine

# Load the training data, then inspect its structure and per-column summary
wine <- read.csv(file = "wine.csv")
str(wine)
## 'data.frame':    25 obs. of  7 variables:
##  $ Year       : int  1952 1953 1955 1957 1958 1959 1960 1961 1962 1963 ...
##  $ Price      : num  7.5 8.04 7.69 6.98 6.78 ...
##  $ WinterRain : int  600 690 502 420 582 485 763 830 697 608 ...
##  $ AGST       : num  17.1 16.7 17.1 16.1 16.4 ...
##  $ HarvestRain: int  160 80 130 110 187 187 290 38 52 155 ...
##  $ Age        : int  31 30 28 26 25 24 23 22 21 20 ...
##  $ FrancePop  : num  43184 43495 44218 45152 45654 ...
summary(wine)
##       Year          Price        WinterRain       AGST       HarvestRain 
##  Min.   :1952   Min.   :6.21   Min.   :376   Min.   :15.0   Min.   : 38  
##  1st Qu.:1960   1st Qu.:6.52   1st Qu.:536   1st Qu.:16.2   1st Qu.: 89  
##  Median :1966   Median :7.12   Median :600   Median :16.5   Median :130  
##  Mean   :1966   Mean   :7.07   Mean   :605   Mean   :16.5   Mean   :149  
##  3rd Qu.:1972   3rd Qu.:7.50   3rd Qu.:697   3rd Qu.:17.1   3rd Qu.:187  
##  Max.   :1978   Max.   :8.49   Max.   :830   Max.   :17.6   Max.   :292  
##       Age         FrancePop    
##  Min.   : 5.0   Min.   :43184  
##  1st Qu.:11.0   1st Qu.:46584  
##  Median :17.0   Median :50255  
##  Mean   :17.2   Mean   :49694  
##  3rd Qu.:23.0   3rd Qu.:52894  
##  Max.   :31.0   Max.   :54602

# Examine correlations: correlation matrix for every pair of columns in wine

cor(wine)
##                 Year   Price WinterRain    AGST HarvestRain      Age
## Year         1.00000 -0.4478   0.016970 -0.2469     0.02801 -1.00000
## Price       -0.44777  1.0000   0.136651  0.6596    -0.56332  0.44777
## WinterRain   0.01697  0.1367   1.000000 -0.3211    -0.27544 -0.01697
## AGST        -0.24692  0.6596  -0.321091  1.0000    -0.06450  0.24692
## HarvestRain  0.02801 -0.5633  -0.275441 -0.0645     1.00000 -0.02801
## Age         -1.00000  0.4478  -0.016970  0.2469    -0.02801  1.00000
## FrancePop    0.99449 -0.4669  -0.001622 -0.2592     0.04126 -0.99449
##             FrancePop
## Year         0.994485
## Price       -0.466862
## WinterRain  -0.001622
## AGST        -0.259162
## HarvestRain  0.041264
## Age         -0.994485
## FrancePop    1.000000
# Same correlation matrix, rounded to two decimals for readability
round(cor(wine), digits = 2)
##              Year Price WinterRain  AGST HarvestRain   Age FrancePop
## Year         1.00 -0.45       0.02 -0.25        0.03 -1.00      0.99
## Price       -0.45  1.00       0.14  0.66       -0.56  0.45     -0.47
## WinterRain   0.02  0.14       1.00 -0.32       -0.28 -0.02      0.00
## AGST        -0.25  0.66      -0.32  1.00       -0.06  0.25     -0.26
## HarvestRain  0.03 -0.56      -0.28 -0.06        1.00 -0.03      0.04
## Age         -1.00  0.45      -0.02  0.25       -0.03  1.00     -0.99
## FrancePop    0.99 -0.47       0.00 -0.26        0.04 -0.99      1.00

# Put the response (Price) first so its correlations form the first row/column
wineReordered <- wine[c(
  "Price", "Year", "WinterRain", "AGST", "HarvestRain", "Age", "FrancePop"
)]
round(cor(wineReordered), digits = 2)
##             Price  Year WinterRain  AGST HarvestRain   Age FrancePop
## Price        1.00 -0.45       0.14  0.66       -0.56  0.45     -0.47
## Year        -0.45  1.00       0.02 -0.25        0.03 -1.00      0.99
## WinterRain   0.14  0.02       1.00 -0.32       -0.28 -0.02      0.00
## AGST         0.66 -0.25      -0.32  1.00       -0.06  0.25     -0.26
## HarvestRain -0.56  0.03      -0.28 -0.06        1.00 -0.03      0.04
## Age          0.45 -1.00      -0.02  0.25       -0.03  1.00     -0.99
## FrancePop   -0.47  0.99       0.00 -0.26        0.04 -0.99      1.00

# Scatterplot matrix of every pair of columns in wine
pairs(wine)

## Correlogram example
# Install corrgram only when it is missing, and name a CRAN mirror explicitly:
# a bare install.packages() call aborts a non-interactive run with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram", repos = "https://cloud.r-project.org")
}
library(corrgram)
## Loading required package: seriation

## plot of chunk unnamed-chunk-1

# Correlogram of the reordered data: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal
corrgram(wineReordered, order = TRUE, lower.panel = panel.shade,
    upper.panel = panel.pie, text.panel = panel.txt,
    main = "Predict price of wine")

## plot of chunk unnamed-chunk-1



# Linear Regression (one variable): Price explained by AGST alone
model1 <- lm(formula = Price ~ AGST, data = wine)
summary(model1)
## 
## Call:
## lm(formula = Price ~ AGST, data = wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7845 -0.2388 -0.0373  0.3899  0.9032 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.418      2.494   -1.37  0.18371    
## AGST           0.635      0.151    4.21  0.00034 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.499 on 23 degrees of freedom
## Multiple R-squared:  0.435,  Adjusted R-squared:  0.41 
## F-statistic: 17.7 on 1 and 23 DF,  p-value: 0.000335

# Sum of Squared Errors for model1: show the per-observation residuals first,
# then square and add them up
model1$residuals
##        1        2        3        4        5        6        7        8 
##  0.04204  0.82984  0.21169  0.15609 -0.23119  0.38992 -0.48959  0.90318 
##        9       10       11       12       13       14       15       16 
##  0.45372  0.14887 -0.23882 -0.08974  0.66186 -0.05212 -0.62727 -0.74715 
##       17       18       19       20       21       22       23       24 
##  0.42114 -0.03727  0.10685 -0.78450 -0.64018 -0.05509 -0.67055 -0.22040 
##       25 
##  0.55867
SSE <- sum(model1$residuals^2)
SSE
## [1] 5.735

# Linear Regression (two variables): add HarvestRain to AGST
model2 <- lm(formula = Price ~ AGST + HarvestRain, data = wine)
summary(model2)
## 
## Call:
## lm(formula = Price ~ AGST + HarvestRain, data = wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8832 -0.1960  0.0618  0.1538  0.5972 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.20265    1.85443   -1.19  0.24759    
## AGST         0.60262    0.11128    5.42  1.9e-05 ***
## HarvestRain -0.00457    0.00101   -4.52  0.00017 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.367 on 22 degrees of freedom
## Multiple R-squared:  0.707,  Adjusted R-squared:  0.681 
## F-statistic: 26.6 on 2 and 22 DF,  p-value: 1.35e-06

# Sum of Squared Errors for model2 (overwrites the SSE computed for model1)
SSE <- sum(model2$residuals^2)
SSE
## [1] 2.97

# Linear Regression (all variables): every column except Year is a predictor
model3 <- lm(
  formula = Price ~ AGST + HarvestRain + WinterRain + Age + FrancePop,
  data = wine
)
summary(model3)
## 
## Call:
## lm(formula = Price ~ AGST + HarvestRain + WinterRain + Age + 
##     FrancePop, data = wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4818 -0.2466 -0.0073  0.2201  0.5199 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.50e-01   1.02e+01   -0.04  0.96520    
## AGST         6.01e-01   1.03e-01    5.84  1.3e-05 ***
## HarvestRain -3.96e-03   8.75e-04   -4.52  0.00023 ***
## WinterRain   1.04e-03   5.31e-04    1.96  0.06442 .  
## Age          5.85e-04   7.90e-02    0.01  0.99417    
## FrancePop   -4.95e-05   1.67e-04   -0.30  0.76958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.302 on 19 degrees of freedom
## Multiple R-squared:  0.829,  Adjusted R-squared:  0.784 
## F-statistic: 18.5 on 5 and 19 DF,  p-value: 1.04e-06

# Sum of Squared Errors for model3 (overwrites the SSE computed for model2)
SSE <- sum(model3$residuals^2)
SSE
## [1] 1.732
###### 

# Remove FrancePop: refit with the remaining four predictors
model4 <- lm(
  formula = Price ~ AGST + HarvestRain + WinterRain + Age,
  data = wine
)
summary(model4)
## 
## Call:
## lm(formula = Price ~ AGST + HarvestRain + WinterRain + Age, data = wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4547 -0.2427  0.0075  0.1977  0.5364 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.429980   1.765898   -1.94  0.06631 .  
## AGST         0.607209   0.098702    6.15  5.2e-06 ***
## HarvestRain -0.003972   0.000854   -4.65  0.00015 ***
## WinterRain   0.001076   0.000507    2.12  0.04669 *  
## Age          0.023931   0.008097    2.96  0.00782 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.295 on 20 degrees of freedom
## Multiple R-squared:  0.829,  Adjusted R-squared:  0.794 
## F-statistic: 24.2 on 4 and 20 DF,  p-value: 2.04e-07

# Correlations: two individual pairs, then the full matrix again
with(wine, cor(WinterRain, Price))
## [1] 0.1367
# Age and FrancePop are almost perfectly negatively correlated
with(wine, cor(Age, FrancePop))
## [1] -0.9945
cor(wine)
##                 Year   Price WinterRain    AGST HarvestRain      Age
## Year         1.00000 -0.4478   0.016970 -0.2469     0.02801 -1.00000
## Price       -0.44777  1.0000   0.136651  0.6596    -0.56332  0.44777
## WinterRain   0.01697  0.1367   1.000000 -0.3211    -0.27544 -0.01697
## AGST        -0.24692  0.6596  -0.321091  1.0000    -0.06450  0.24692
## HarvestRain  0.02801 -0.5633  -0.275441 -0.0645     1.00000 -0.02801
## Age         -1.00000  0.4478  -0.016970  0.2469    -0.02801  1.00000
## FrancePop    0.99449 -0.4669  -0.001622 -0.2592     0.04126 -0.99449
##             FrancePop
## Year         0.994485
## Price       -0.466862
## WinterRain  -0.001622
## AGST        -0.259162
## HarvestRain  0.041264
## Age         -0.994485
## FrancePop    1.000000

# Remove Age and FrancePop: keep only the three weather predictors
model5 <- lm(formula = Price ~ AGST + HarvestRain + WinterRain, data = wine)
summary(model5)
## 
## Call:
## lm(formula = Price ~ AGST + HarvestRain + WinterRain, data = wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6747 -0.1296  0.0197  0.2075  0.6385 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.301626   2.036674   -2.11  0.04683 *  
## AGST         0.681024   0.111701    6.10  4.7e-06 ***
## HarvestRain -0.003948   0.000999   -3.95  0.00073 ***
## WinterRain   0.001177   0.000592    1.99  0.06010 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.345 on 21 degrees of freedom
## Multiple R-squared:  0.754,  Adjusted R-squared:  0.719 
## F-statistic: 21.4 on 3 and 21 DF,  p-value: 1.36e-06


# Apply model on test set

# Load the held-out observations and inspect their structure
wineTest <- read.csv(file = "wine_test.csv")
str(wineTest)
## 'data.frame':    2 obs. of  7 variables:
##  $ Year       : int  1979 1980
##  $ Price      : num  6.95 6.5
##  $ WinterRain : int  717 578
##  $ AGST       : num  16.2 16
##  $ HarvestRain: int  122 74
##  $ Age        : int  4 3
##  $ FrancePop  : num  54836 55110

# Predict test-set prices with model4 (AGST, HarvestRain, WinterRain, Age)
predictTest <- predict(object = model4, newdata = wineTest)
predictTest
##     1     2 
## 6.769 6.685

# Compute R-squared on the test set.
# SSE: squared error of model4's predictions on the test prices.
# SST: squared error of a baseline that always predicts the mean price of
# the training data (wine), not of the test data.
SSE <- sum((wineTest$Price - predictTest)^2)
SST <- sum((wineTest$Price - mean(wine$Price))^2)
1 - SSE / SST
## [1] 0.7944