This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Choose the CSV file interactively and load it into data1.
# The first column is read in as `ï..Price` (see the summary output below),
# most likely because the file starts with a UTF-8 byte-order mark; the rest
# of this notebook refers to the column by that name.
data1 <- read.csv(file = file.choose())
summary(object = data1)
## ï..Price County Size Elevation
## Min. : 1.70 Min. :0.0000 Min. : 6.90 Min. : 0.000
## 1st Qu.: 5.35 1st Qu.:0.0000 1st Qu.: 20.35 1st Qu.: 2.000
## Median :11.70 Median :1.0000 Median : 51.40 Median : 4.000
## Mean :11.95 Mean :0.6129 Mean : 139.97 Mean : 4.645
## 3rd Qu.:16.05 3rd Qu.:1.0000 3rd Qu.: 104.10 3rd Qu.: 7.000
## Max. :37.20 Max. :1.0000 Max. :1695.20 Max. :20.000
## Sewer Date Flood Distance
## Min. : 0 Min. :-103.00 Min. :0.0000 Min. : 0.000
## 1st Qu.: 0 1st Qu.: -63.50 1st Qu.:0.0000 1st Qu.: 0.850
## Median : 900 Median : -59.00 Median :0.0000 Median : 4.900
## Mean : 1981 Mean : -58.65 Mean :0.1613 Mean : 5.132
## 3rd Qu.: 3450 3rd Qu.: -51.00 3rd Qu.:0.0000 3rd Qu.: 5.500
## Max. :10000 Max. : -4.00 Max. :1.0000 Max. :16.500
# Draw a histogram to check how the price of the plots is distributed.
hist(data1$ï..Price)
# The histogram is skewed. The average price looks to be around 10k/acre.
# To get a better-shaped histogram, take the natural log of Price
# (log() without a base argument is base e, not base 10; the models further
# down apply the same log() transform to the response).
logprice <- log(data1$ï..Price)
hist(logprice, main = "Histogram of log(Price)")
# Taking the log of price gives a better histogram.
#calling libraries
library(car)
## Warning: package 'car' was built under R version 3.4.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(visreg)
## Warning: package 'visreg' was built under R version 3.4.2
library(rgl)
## Warning: package 'rgl' was built under R version 3.4.2
library(knitr)
## Warning: package 'knitr' was built under R version 3.4.2
library(scatterplot3d) #for 3Dfigures
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.2
# Pairwise scatter plots of every column, to see how the variables relate
# to one another.
# Price is correlated with distance, elevation, sewer and date.
plot(x = data1, pch = 10, col = "blue")
ggpairs(data = data1)
# The same relationships as a correlation matrix shown in numbers.
cordata1 <- cor(x = data1)
corrplot(corr = cordata1, method = "number")
#there is some correlation between the independent variables:
#distance & county; flood & county; elevation & county
# Scatter plots of Price against individual predictors.
# Open the help page for scatterplot() (interactive lookup only).
?scatterplot
## starting httpd help server ...
## done
# In each call the first argument (Price) is passed as x and the second as y.
# NOTE(review): the titles read "Price vs <predictor>", which usually implies
# Price on the y-axis -- confirm whether the two arguments should be swapped.
scatterplot(data1$ï..Price, data1$Date, main="Price vs Date over time")
scatterplot(data1$ï..Price, data1$Size)
scatterplot(data1$ï..Price, data1$Elevation, main="Price Vs Elevation")
scatterplot(data1$ï..Price, data1$Distance)
# Attach data1 so its columns can be referenced by bare name. attach() is
# kept only because a later model (mod3) refers to Elevation, Date, Flood and
# Distance without a data argument; the centering below no longer relies on it.
attach(data1)
# Fix the RNG seed. No random call is visible in this block; presumably this
# is for reproducibility of later simulation-based diagnostics -- TODO confirm.
set.seed(1)
# Center the predictors (subtract the mean, no rescaling), to try whether
# centered predictors help. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
elevation.c <- scale(data1$Elevation, center = TRUE, scale = FALSE)
date.c <- scale(data1$Date, center = TRUE, scale = FALSE)
flood.c <- scale(data1$Flood, center = TRUE, scale = FALSE)
distance.c <- scale(data1$Distance, center = TRUE, scale = FALSE)
# Bind the centered variables to the original data and display the result.
# The four new columns are unnamed at this point (printed as 1 to 4).
new.c.vars <- cbind(elevation.c, date.c, flood.c, distance.c)
data2 <- cbind(data1, new.c.vars)
data2
## ï..Price County Size Elevation Sewer Date Flood Distance 1
## 1 4.5 1 138.4 10 3000 -103 0 0.3 5.3548387
## 2 10.6 1 52.0 4 0 -103 0 2.5 -0.6451613
## 3 1.7 0 16.1 0 2640 -98 1 10.3 -4.6451613
## 4 5.0 0 1695.2 1 3500 -93 0 14.0 -3.6451613
## 5 5.0 0 845.0 1 1000 -92 1 14.0 -3.6451613
## 6 3.3 1 6.9 2 10000 -86 0 0.0 -2.6451613
## 7 5.7 1 105.9 4 0 -68 0 0.0 -0.6451613
## 8 6.2 1 56.6 4 0 -64 0 0.0 -0.6451613
## 9 19.4 1 51.4 20 1300 -63 0 1.2 15.3548387
## 10 3.2 1 22.1 0 6000 -62 0 0.0 -4.6451613
## 11 4.7 1 22.1 0 6000 -61 0 0.0 -4.6451613
## 12 6.9 1 27.7 3 4500 -60 0 0.0 -1.6451613
## 13 8.1 1 18.6 5 5000 -59 0 0.5 0.3548387
## 14 11.6 1 69.9 8 0 -59 0 4.4 3.3548387
## 15 19.3 1 145.7 10 0 -59 0 4.2 5.3548387
## 16 11.7 1 77.2 9 0 -59 0 4.5 4.3548387
## 17 13.3 1 26.2 8 0 -59 0 4.7 3.3548387
## 18 15.1 1 102.3 6 0 -59 0 4.9 1.3548387
## 19 12.4 1 49.5 11 0 -59 0 4.6 6.3548387
## 20 15.3 1 12.2 8 0 -59 0 5.0 3.3548387
## 21 12.2 0 320.6 0 4000 -54 0 16.5 -4.6451613
## 22 18.1 1 9.9 5 0 -54 0 5.2 0.3548387
## 23 16.8 1 15.3 2 0 -53 0 5.5 -2.6451613
## 24 5.9 0 55.2 0 1320 -49 1 11.9 -4.6451613
## 25 4.0 0 116.2 2 900 -45 1 5.5 -2.6451613
## 26 37.2 0 15.0 5 0 -39 0 7.2 0.3548387
## 27 18.2 0 23.4 5 4420 -39 0 5.5 0.3548387
## 28 15.1 0 132.8 2 2640 -35 0 10.2 -2.6451613
## 29 22.9 0 12.0 5 3400 -16 0 5.5 0.3548387
## 30 15.2 0 67.0 2 900 -5 1 5.5 -2.6451613
## 31 21.9 0 30.8 2 900 -4 0 5.5 -2.6451613
## 2 3 4
## 1 -44.3548387 -0.1612903 -4.83225806
## 2 -44.3548387 -0.1612903 -2.63225806
## 3 -39.3548387 0.8387097 5.16774194
## 4 -34.3548387 -0.1612903 8.86774194
## 5 -33.3548387 0.8387097 8.86774194
## 6 -27.3548387 -0.1612903 -5.13225806
## 7 -9.3548387 -0.1612903 -5.13225806
## 8 -5.3548387 -0.1612903 -5.13225806
## 9 -4.3548387 -0.1612903 -3.93225806
## 10 -3.3548387 -0.1612903 -5.13225806
## 11 -2.3548387 -0.1612903 -5.13225806
## 12 -1.3548387 -0.1612903 -5.13225806
## 13 -0.3548387 -0.1612903 -4.63225806
## 14 -0.3548387 -0.1612903 -0.73225806
## 15 -0.3548387 -0.1612903 -0.93225806
## 16 -0.3548387 -0.1612903 -0.63225806
## 17 -0.3548387 -0.1612903 -0.43225806
## 18 -0.3548387 -0.1612903 -0.23225806
## 19 -0.3548387 -0.1612903 -0.53225806
## 20 -0.3548387 -0.1612903 -0.13225806
## 21 4.6451613 -0.1612903 11.36774194
## 22 4.6451613 -0.1612903 0.06774194
## 23 5.6451613 -0.1612903 0.36774194
## 24 9.6451613 0.8387097 6.76774194
## 25 13.6451613 0.8387097 0.36774194
## 26 19.6451613 -0.1612903 2.06774194
## 27 19.6451613 -0.1612903 0.36774194
## 28 23.6451613 -0.1612903 5.06774194
## 29 42.6451613 -0.1612903 0.36774194
## 30 53.6451613 0.8387097 0.36774194
## 31 54.6451613 -0.1612903 0.36774194
# Name the four centered columns appended to data2 (columns 9 to 12).
names(data2)[9:12] <- c("elevation.c", "date.c", "flood.c", "distance.c")
# Model 1: Price on the centered predictors. Every variable is taken from
# data2 through the data argument, rather than mixing data2$... with objects
# in the global environment, so the fit is self-contained and can be used
# with predict(newdata = ...).
mod1 <- lm(
  ï..Price ~ elevation.c + date.c + flood.c + distance.c,
  data = data2
)
summary(mod1)
##
## Call:
## lm(formula = data2$ï..Price ~ elevation.c + date.c + flood.c +
## distance.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.311 -2.954 -1.040 1.273 18.915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.9516 0.8958 13.342 3.86e-13 ***
## elevation.c 0.6861 0.2326 2.950 0.00664 **
## date.c 0.1907 0.0372 5.125 2.41e-05 ***
## flood.c -6.9243 2.7845 -2.487 0.01964 *
## distance.c 0.5933 0.2285 2.596 0.01531 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.988 on 26 degrees of freedom
## Multiple R-squared: 0.6378, Adjusted R-squared: 0.582
## F-statistic: 11.44 on 4 and 26 DF, p-value: 1.718e-05
#Adjusted R-squared is 0.582
# Model 2: same centered predictors, with the natural log of Price as the
# response. All variables come from data2 through the data argument instead
# of data2$... plus global objects.
mod2 <- lm(
  log(ï..Price) ~ elevation.c + date.c + flood.c + distance.c,
  data = data2
)
summary(mod2)
##
## Call:
## lm(formula = log(data2$ï..Price) ~ elevation.c + date.c + flood.c +
## distance.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5099 -0.2090 -0.1097 0.1721 1.0031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.25944 0.06478 34.879 < 2e-16 ***
## elevation.c 0.07459 0.01682 4.435 0.000149 ***
## date.c 0.01857 0.00269 6.902 2.5e-07 ***
## flood.c -0.77886 0.20136 -3.868 0.000659 ***
## distance.c 0.05908 0.01653 3.575 0.001401 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3607 on 26 degrees of freedom
## Multiple R-squared: 0.7806, Adjusted R-squared: 0.7468
## F-statistic: 23.12 on 4 and 26 DF, p-value: 3.049e-08
#Adjusted R-squared increased to 0.747. Taking the log of price gives a better model
# Model 3: Price on the original (uncentered) predictors, for comparison
# with mod1. The columns are taken from data2 through the data argument, so
# the model no longer depends on data1 having been attached.
mod3 <- lm(ï..Price ~ Elevation + Date + Flood + Distance, data = data2)
summary(mod3)
##
## Call:
## lm(formula = data2$ï..Price ~ Elevation + Date + Flood + Distance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.311 -2.954 -1.040 1.273 18.915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.0191 2.9536 6.101 1.90e-06 ***
## Elevation 0.6861 0.2326 2.950 0.00664 **
## Date 0.1907 0.0372 5.125 2.41e-05 ***
## Flood -6.9243 2.7845 -2.487 0.01964 *
## Distance 0.5933 0.2285 2.596 0.01531 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.988 on 26 degrees of freedom
## Multiple R-squared: 0.6378, Adjusted R-squared: 0.582
## F-statistic: 11.44 on 4 and 26 DF, p-value: 1.718e-05
#Adjusted R-squared is 0.58
#without the log transform the adjusted R-squared is lower
# Residuals-vs-fitted plot for the log-price model (diagnostic plot 1 only).
plot(x = mod2, which = 1, pch = 10)
# Durbin-Watson test for autocorrelation in the residuals of mod2;
# dwt() is car's short alias for durbinWatsonTest().
durbinWatsonTest(mod2)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.1952503 2.37622 0.398
## Alternative hypothesis: rho != 0
# Search the installed help pages for "dwtest" (interactive lookup only).
help.search("dwtest")
# Quantile-comparison plot for mod2.
qqPlot(mod2)
# Outlier test on mod2 (reports studentized residuals with Bonferroni p-values).
outlierTest(mod2)
## rstudent unadjusted p-value Bonferonni p
## 2 3.704906 0.0010527 0.032634
# Open the help page for outlierTest(), the function actually called above;
# "outlier.test" is its former name in car.
help("outlierTest")
# Model 4: regress Price on every other column to see which predictors have
# large p-values. The response is named inside the formula so that response
# and predictors both come from data1 (previously the response was pulled
# from data2 while the predictors came from data1).
mod4 <- lm(ï..Price ~ ., data = data1)
summary(mod4)
##
## Call:
## lm(formula = data2$ï..Price ~ ., data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.169 -2.957 -0.256 2.070 13.031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.364e+01 3.829e+00 6.174 2.68e-06 ***
## County -8.789e+00 3.652e+00 -2.407 0.024532 *
## Size -6.043e-03 3.501e-03 -1.726 0.097702 .
## Elevation 5.193e-01 2.386e-01 2.177 0.040030 *
## Sewer -9.573e-04 4.169e-04 -2.296 0.031126 *
## Date 8.508e-02 4.865e-02 1.749 0.093646 .
## Flood -1.202e+01 2.989e+00 -4.020 0.000536 ***
## Distance 1.858e-01 3.395e-01 0.547 0.589386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.431 on 23 degrees of freedom
## Multiple R-squared: 0.747, Adjusted R-squared: 0.67
## F-statistic: 9.703 on 7 and 23 DF, p-value: 1.351e-05
#We will keep the variables whose p-value is less than 0.05.
#Distance, Size and Date have p-values above 0.05; Distance and Size are dropped first.
#R-squared is 0.747, so about a quarter of the variance is still unexplained.
#Let's try a different model that eliminates Distance & Size
#and see if our adjusted R-squared increases
# Model 5: Price on all columns of data1 except Distance and Size. The
# response is referenced by column name so it is resolved through the data
# argument, like the predictors, instead of via data1$... inside the formula.
mod5 <- lm(ï..Price ~ . - Distance - Size, data = data1)
summary(mod5)
##
## Call:
## lm(formula = data1$ï..Price ~ . - Distance - Size, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5688 -2.7883 -0.3453 1.9312 14.4498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.497e+01 2.597e+00 9.614 7.05e-10 ***
## County -7.439e+00 2.383e+00 -3.122 0.00450 **
## Elevation 5.291e-01 2.397e-01 2.207 0.03671 *
## Sewer -9.513e-04 3.800e-04 -2.504 0.01919 *
## Date 1.247e-01 3.840e-02 3.249 0.00330 **
## Flood -1.064e+01 2.871e+00 -3.707 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.518 on 25 degrees of freedom
## Multiple R-squared: 0.7143, Adjusted R-squared: 0.6571
## F-statistic: 12.5 on 5 and 25 DF, p-value: 3.831e-06
# Adjusted R-squared is 0.657. Let's see if this can be slightly improved.
# Open the help page for par() (interactive lookup only).
?par
# Draw the diagnostic plots for mod5 in a 2x2 grid. par() returns the previous
# settings, which are restored afterwards so that later plots are not drawn
# into the same 2x2 layout.
old_par <- par(mfrow = c(2, 2))
plot(mod5)
par(old_par)
# Row 26 looks like an outlier; build a copy of data1 without that single
# row to see whether the model improves.
data3 <- data1[seq_len(nrow(data1)) != 26, ]
data3
## ï..Price County Size Elevation Sewer Date Flood Distance
## 1 4.5 1 138.4 10 3000 -103 0 0.3
## 2 10.6 1 52.0 4 0 -103 0 2.5
## 3 1.7 0 16.1 0 2640 -98 1 10.3
## 4 5.0 0 1695.2 1 3500 -93 0 14.0
## 5 5.0 0 845.0 1 1000 -92 1 14.0
## 6 3.3 1 6.9 2 10000 -86 0 0.0
## 7 5.7 1 105.9 4 0 -68 0 0.0
## 8 6.2 1 56.6 4 0 -64 0 0.0
## 9 19.4 1 51.4 20 1300 -63 0 1.2
## 10 3.2 1 22.1 0 6000 -62 0 0.0
## 11 4.7 1 22.1 0 6000 -61 0 0.0
## 12 6.9 1 27.7 3 4500 -60 0 0.0
## 13 8.1 1 18.6 5 5000 -59 0 0.5
## 14 11.6 1 69.9 8 0 -59 0 4.4
## 15 19.3 1 145.7 10 0 -59 0 4.2
## 16 11.7 1 77.2 9 0 -59 0 4.5
## 17 13.3 1 26.2 8 0 -59 0 4.7
## 18 15.1 1 102.3 6 0 -59 0 4.9
## 19 12.4 1 49.5 11 0 -59 0 4.6
## 20 15.3 1 12.2 8 0 -59 0 5.0
## 21 12.2 0 320.6 0 4000 -54 0 16.5
## 22 18.1 1 9.9 5 0 -54 0 5.2
## 23 16.8 1 15.3 2 0 -53 0 5.5
## 24 5.9 0 55.2 0 1320 -49 1 11.9
## 25 4.0 0 116.2 2 900 -45 1 5.5
## 27 18.2 0 23.4 5 4420 -39 0 5.5
## 28 15.1 0 132.8 2 2640 -35 0 10.2
## 29 22.9 0 12.0 5 3400 -16 0 5.5
## 30 15.2 0 67.0 2 900 -5 1 5.5
## 31 21.9 0 30.8 2 900 -4 0 5.5
# Show the structure of data3 (data1 with row 26 removed).
str(data3)
## 'data.frame': 30 obs. of 8 variables:
## $ ï..Price : num 4.5 10.6 1.7 5 5 3.3 5.7 6.2 19.4 3.2 ...
## $ County : int 1 1 0 0 0 1 1 1 1 1 ...
## $ Size : num 138.4 52 16.1 1695.2 845 ...
## $ Elevation: int 10 4 0 1 1 2 4 4 20 0 ...
## $ Sewer : int 3000 0 2640 3500 1000 10000 0 0 1300 6000 ...
## $ Date : int -103 -103 -98 -93 -92 -86 -68 -64 -63 -62 ...
## $ Flood : int 0 0 1 0 1 0 0 0 0 0 ...
## $ Distance : num 0.3 2.5 10.3 14 14 0 0 0 1.2 0 ...
# Show the structure of the full data set, for comparison with data3.
str(data1)
## 'data.frame': 31 obs. of 8 variables:
## $ ï..Price : num 4.5 10.6 1.7 5 5 3.3 5.7 6.2 19.4 3.2 ...
## $ County : int 1 1 0 0 0 1 1 1 1 1 ...
## $ Size : num 138.4 52 16.1 1695.2 845 ...
## $ Elevation: int 10 4 0 1 1 2 4 4 20 0 ...
## $ Sewer : int 3000 0 2640 3500 1000 10000 0 0 1300 6000 ...
## $ Date : int -103 -103 -98 -93 -92 -86 -68 -64 -63 -62 ...
## $ Flood : int 0 0 1 0 1 0 0 0 0 0 ...
## $ Distance : num 0.3 2.5 10.3 14 14 0 0 0 1.2 0 ...
# Model 6: natural log of Price on every column of data3 (row 26 removed)
# except Distance and Size. In the formula, `.` stands for the remaining
# columns of data3 and `-` removes a term.
mod6 <- lm(log(ï..Price) ~ . -Distance - Size, data= data3)
summary(mod6)
##
## Call:
## lm(formula = log(ï..Price) ~ . - Distance - Size, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54690 -0.21040 0.01803 0.23982 0.56446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.419e+00 2.005e-01 17.056 6.42e-15 ***
## County -3.592e-01 1.857e-01 -1.934 0.065025 .
## Elevation 4.525e-02 1.763e-02 2.567 0.016920 *
## Sewer -9.915e-05 2.848e-05 -3.482 0.001926 **
## Date 1.403e-02 2.825e-03 4.965 4.54e-05 ***
## Flood -9.153e-01 2.198e-01 -4.164 0.000347 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3321 on 24 degrees of freedom
## Multiple R-squared: 0.8041, Adjusted R-squared: 0.7633
## F-statistic: 19.7 on 5 and 24 DF, p-value: 8.585e-08
#We have an improved adjusted R-squared of 0.763. This looks like a better model
# Try stepwise model selection with step(), starting from the model that
# regresses Price (untransformed) on every other column of data3.
step(object = lm(formula = ï..Price ~ ., data = data3))
## Start: AIC=75.45
## ï..Price ~ County + Size + Elevation + Sewer + Date + Flood +
## Distance
##
## Df Sum of Sq RSS AIC
## - County 1 8.266 225.90 74.566
## <none> 217.63 75.448
## - Size 1 18.032 235.66 75.836
## - Sewer 1 26.544 244.17 76.901
## - Distance 1 27.596 245.23 77.030
## - Elevation 1 100.809 318.44 84.867
## - Flood 1 116.091 333.72 86.273
## - Date 1 127.501 345.13 87.282
##
## Step: AIC=74.57
## ï..Price ~ Size + Elevation + Sewer + Date + Flood + Distance
##
## Df Sum of Sq RSS AIC
## - Size 1 12.890 238.79 74.231
## <none> 225.90 74.566
## - Sewer 1 18.645 244.54 74.946
## - Distance 1 85.238 311.14 82.171
## - Elevation 1 98.595 324.49 83.432
## - Flood 1 126.247 352.14 85.885
## - Date 1 308.603 534.50 98.404
##
## Step: AIC=74.23
## ï..Price ~ Elevation + Sewer + Date + Flood + Distance
##
## Df Sum of Sq RSS AIC
## <none> 238.79 74.231
## - Sewer 1 20.87 259.66 74.745
## - Distance 1 78.70 317.48 80.777
## - Elevation 1 101.70 340.49 82.875
## - Flood 1 115.21 354.00 84.043
## - Date 1 451.51 690.29 104.078
##
## Call:
## lm(formula = ï..Price ~ Elevation + Sewer + Date + Flood + Distance,
## data = data3)
##
## Coefficients:
## (Intercept) Elevation Sewer Date Flood
## 17.9571991 0.5422328 -0.0004005 0.1627357 -6.1840320
## Distance
## 0.4229008
# Model 9: same predictors and data as mod6 (all columns of data3 except
# Distance and Size), but with untransformed Price as the response.
mod9 <- lm(ï..Price ~ . -Distance - Size, data= data3)
summary(mod9)
##
## Call:
## lm(formula = ï..Price ~ . - Distance - Size, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0186 -2.2651 -0.3114 2.1549 5.1596
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.0187525 1.9634490 11.214 5.01e-11 ***
## County -4.4613706 1.8189990 -2.453 0.02183 *
## Elevation 0.5086667 0.1726287 2.947 0.00704 **
## Sewer -0.0006846 0.0002789 -2.455 0.02173 *
## Date 0.1308357 0.0276699 4.728 8.28e-05 ***
## Flood -7.6795702 2.1524916 -3.568 0.00156 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.252 on 24 degrees of freedom
## Multiple R-squared: 0.7747, Adjusted R-squared: 0.7278
## F-statistic: 16.51 on 5 and 24 DF, p-value: 4.372e-07
#We have an adjusted R-squared of 0.7278. Since mod6 gives better results, we will choose mod6
# Open the help page for predict() (interactive lookup only).
help("predict")
# Predictions from mod6. No newdata is supplied, so these are the fitted
# values for the rows of data3. mod6 was fit to log(ï..Price), so the values
# are on the natural-log scale; apply exp() to express them as prices.
lesiesaltprice <- predict(object = mod6)
lesiesaltprice
## 1 2 3 4 5 6 7
## 1.7704282 1.7963983 0.8676524 1.8130681 1.1596736 0.9528662 2.2873697
## 8 9 10 11 12 13 14
## 2.3434807 2.9525663 1.5956419 1.6096697 1.9081652 1.9631118 2.5946080
## 15 16 17 18 19 20 21
## 2.6851022 2.6398551 2.5946080 2.5041137 2.7303493 2.5946080 2.2653279
## 22 23 24 25 27 28 29
## 2.5290053 2.4072917 1.6858916 1.8741403 2.6603364 2.7571948 3.0841087
## 30 31
## 2.4352504 3.3645778
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).