Loading required libraries
library(ggplot2)
library(MASS)
library(caret)
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(matrixcalc)
Pick one of the quantitative independent variables (Xi) from the data set below, and define that variable as X. Also, pick one of the dependent variables (Yi) below, and define that as Y.
# Read the Problem 1 data set (columns Y1-Y4 and X1-X4).
# Without an explicit encoding the first header was read as "ï..Y1", the
# signature of an undeclared UTF-8 byte-order mark at the start of the file;
# declaring "UTF-8-BOM" strips the mark so the column is named "Y1".
data <- read.csv(
  "https://raw.githubusercontent.com/VioletaStoyanova/Data605/master/Problem1%20.csv",
  fileEncoding = "UTF-8-BOM",
  stringsAsFactors = FALSE
)
data
## Y1 Y2 Y3 Y4 X1 X2 X3 X4
## 1 20.3 20.8 28.4 20.2 9.3 7.4 9.5 9.3
## 2 19.1 14.6 21.5 18.6 4.1 6.4 3.7 12.4
## 3 19.3 18.0 20.8 22.6 22.4 8.5 11.7 19.9
## 4 20.9 7.3 22.2 11.4 9.1 9.5 7.4 6.9
## 5 22.0 19.4 21.6 23.6 15.8 11.8 5.3 -1.0
## 6 23.5 13.5 21.8 24.0 7.1 8.8 7.4 10.6
## 7 13.8 14.7 25.2 26.0 15.9 8.4 7.4 6.4
## 8 18.8 15.3 22.5 26.8 6.9 5.1 8.6 10.6
## 9 20.9 12.6 21.1 19.7 16.0 11.4 9.1 1.2
## 10 18.6 13.0 21.7 22.7 6.7 15.1 11.4 7.7
## 11 22.3 13.1 21.4 16.8 8.2 12.6 8.4 15.5
## 12 17.6 10.3 20.8 20.2 16.0 8.0 7.3 6.9
## 13 20.8 14.9 23.0 21.7 6.4 10.3 11.3 13.7
## 14 28.7 14.8 17.4 20.9 11.8 10.4 4.4 3.7
## 15 15.2 16.2 21.3 26.9 3.5 9.5 9.3 4.4
## 16 20.9 15.7 15.1 16.3 21.7 9.5 10.9 11.5
## 17 18.4 16.3 17.8 19.9 12.2 15.1 10.9 4.2
## 18 10.3 11.5 26.4 15.5 9.3 6.6 7.7 13.9
## 19 26.3 12.2 21.6 26.5 8.0 15.4 7.7 12.9
## 20 28.1 11.8 22.5 21.7 6.2 8.2 11.5 1.2
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
# Five-number summary plus mean of the chosen X variable (X2)
summary(data[["X2"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.10 8.15 9.50 9.90 11.50 15.40
# Five-number summary plus mean of the chosen Y variable (Y2)
summary(data[["Y2"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.30 12.50 14.65 14.30 15.82 20.80
# X2 is the chosen independent variable (X).
# Y2 is the chosen dependent variable (Y).
# Distribution of each chosen variable. histogram() is not a base R function;
# presumably it is lattice's, which is attached as a dependency of caret.
histogram(data$X2)
histogram(data$Y2)
a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(X<x | Y>y)
Define the 3rd quartile for the X2 variable
# x: the 3rd quartile of X2 (assigned and printed)
(xQ3 <- quantile(data$X2, probs = 0.75))
## 75%
## 11.5
Define the 1st quartile for the Y2 variable
# y: the 1st quartile of Y2 (assigned and printed)
(yQ1 <- quantile(data$Y2, probs = 0.25))
## 25%
## 12.5
Formula for Conditional Probability
$P(x \mid y) = \dfrac{P(x, y)}{P(y)}$ a.
# (a) P(X > x | Y > y) = P(X > x, Y > y) / P(Y > y).
# Each probability is a one-cell data frame (column n): row count / total rows.
numerator <- (data %>% filter(X2 > xQ3, Y2 > yQ1) %>% tally()) / nrow(data)
denominator <- (data %>% filter(Y2 > yQ1) %>% tally()) / nrow(data)
(a <- numerator / denominator)
## n
## 1 0.2666667
# (b) P(X > x, Y > y): the joint probability that both events occur.
# Xx and Yy are the marginal probabilities P(X > x) and P(Y > y). Their product
# equals the joint probability only if X and Y are independent, so the joint
# probability is counted directly from the rows where both conditions hold.
Xx <- filter(data, X2 > xQ3) %>% tally() / nrow(data)
Yy <- filter(data, Y2 > yQ1) %>% tally() / nrow(data)
(b <- filter(data, X2 > xQ3 & Y2 > yQ1) %>% tally() / nrow(data))
## n
## 1 0.2
# (c) P(X < x | Y > y) = P(X < x, Y > y) / P(Y > y).
# The result is stored under the name `c`, as a one-cell data frame.
numerator <- (data %>% filter(X2 < xQ3, Y2 > yQ1) %>% tally()) / nrow(data)
denominator <- (data %>% filter(Y2 > yQ1) %>% tally()) / nrow(data)
(c <- numerator / denominator)
## n
## 1 0.7333333
# Contingency table of X2 against Y2, split at x (3rd quartile of X2) and
# y (1st quartile of Y2). c1..c9 are the cells and margins in row order:
#   c1 c2 c3   (X <= x: Y <= y, Y > y, row total)
#   c4 c5 c6   (X >  x: Y <= y, Y > y, row total)
#   c7 c8 c9   (column totals and grand total)
c1 <- with(data, sum(X2 <= xQ3 & Y2 <= yQ1, na.rm = TRUE))
c2 <- with(data, sum(X2 <= xQ3 & Y2 > yQ1, na.rm = TRUE))
c4 <- with(data, sum(X2 > xQ3 & Y2 <= yQ1, na.rm = TRUE))
c5 <- with(data, sum(X2 > xQ3 & Y2 > yQ1, na.rm = TRUE))
c3 <- c1 + c2
c6 <- c4 + c5
c7 <- c1 + c4
c8 <- c2 + c5
c9 <- c3 + c6
dfcont <- round(rbind(c(c1, c2, c3), c(c4, c5, c6), c(c7, c8, c9)), 3)
dimnames(dfcont) <- list(
  c("X<=x", "X>x", "Total"),
  c("Y<=y", "Y>y", "Total")
)
(dfcont <- knitr::kable(as.table(dfcont)))
| Y<=y | Y>y | Total | |
|---|---|---|---|
| X<=x | 4 | 11 | 15 |
| X>x | 1 | 4 | 5 |
| Total | 5 | 15 | 20 |
# Chi-square test of association on the 2 x 2 interior of the contingency
# table (totals excluded). The cells come from the counts computed above
# (c1, c2 = row X <= x; c4, c5 = row X > x) rather than being retyped by hand.
mat <- matrix(c(c1, c2, c4, c5), nrow = 2, ncol = 2, byrow = TRUE)
chisq.test(mat, correct = TRUE)
## Warning in chisq.test(mat, correct = TRUE): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mat
## X-squared = 0, df = 1, p-value = 1
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
# 1st quartile of X2, the threshold that defines event A (assigned and printed)
(xQ1 <- quantile(data$X2, probs = 0.25))
## 25%
## 8.15
# Contingency table of X2 against Y2, both split at their 1st quartiles
# (xQ1 and yQ1). c1..c9 are the cells and margins in row order:
#   c1 c2 c3   (X <= x: Y <= y, Y > y, row total)
#   c4 c5 c6   (X >  x: Y <= y, Y > y, row total)
#   c7 c8 c9   (column totals and grand total)
c1 <- with(data, sum(X2 <= xQ1 & Y2 <= yQ1, na.rm = TRUE))
c2 <- with(data, sum(X2 <= xQ1 & Y2 > yQ1, na.rm = TRUE))
c4 <- with(data, sum(X2 > xQ1 & Y2 <= yQ1, na.rm = TRUE))
c5 <- with(data, sum(X2 > xQ1 & Y2 > yQ1, na.rm = TRUE))
c3 <- c1 + c2
c6 <- c4 + c5
c7 <- c1 + c4
c8 <- c2 + c5
c9 <- c3 + c6
dfcont <- round(rbind(c(c1, c2, c3), c(c4, c5, c6), c(c7, c8, c9)), 3)
dimnames(dfcont) <- list(
  c("X<=x", "X>x", "Total"),
  c("Y<=y", "Y>y", "Total")
)
(dfcont <- knitr::kable(as.table(dfcont)))
| Y<=y | Y>y | Total | |
|---|---|---|---|
| X<=x | 2 | 3 | 5 |
| X>x | 3 | 12 | 15 |
| Total | 5 | 15 | 20 |
# Chi-square test of association between A (X above its 1st quartile) and
# B (Y above its 1st quartile). The 2 x 2 cells come from the counts computed
# above (c1, c2 = row X <= x; c4, c5 = row X > x) rather than being retyped.
mat <- matrix(c(c1, c2, c4, c5), nrow = 2, ncol = 2, byrow = TRUE)
chisq.test(mat, correct = TRUE)
## Warning in chisq.test(mat, correct = TRUE): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mat
## X-squared = 0.088889, df = 1, p-value = 0.7656
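From our Chi-square test we can see that the p-value is larger than 0.05, so we fail to reject the null hypothesis H0: there is no evidence of an association between A and B. (A non-significant test does not prove independence, and the warning above indicates that the expected counts are small for the chi-square approximation.)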
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following. 5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
# Load the Kaggle House Prices training set, keeping text columns as
# character, and show the first six rows.
train <- read.csv(
  file = "https://raw.githubusercontent.com/VioletaStoyanova/Data605/master/train.csv",
  stringsAsFactors = FALSE
)
head(train, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
# Univariate descriptive statistics for every column of the training data
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## LandSlope Neighborhood Condition1
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Condition2 BldgType HouseStyle OverallQual
## Length:1460 Length:1460 Length:1460 Min. : 1.000
## Class :character Class :character Class :character 1st Qu.: 5.000
## Mode :character Mode :character Mode :character Median : 6.000
## Mean : 6.099
## 3rd Qu.: 7.000
## Max. :10.000
##
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.000 Min. :1872 Min. :1950 Length:1460
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Class :character
## Median :5.000 Median :1973 Median :1994 Mode :character
## Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :9.000 Max. :2010 Max. :2010
##
## RoofMatl Exterior1st Exterior2nd
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## MasVnrType MasVnrArea ExterQual ExterCond
## Length:1460 Min. : 0.0 Length:1460 Length:1460
## Class :character 1st Qu.: 0.0 Class :character Class :character
## Mode :character Median : 0.0 Mode :character Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature
## Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 Length:1460
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 Class :character
## Median : 0.00 Median : 6.000 Median :2008 Mode :character
## Mean : 43.49 Mean : 6.322 Mean :2008
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :15500.00 Max. :12.000 Max. :2010
##
## SaleCondition SalePrice
## Length:1460 Min. : 34900
## Class :character 1st Qu.:129975
## Mode :character Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Univariate plots for BedroomAbvGr on a 2 x 2 grid: histogram, boxplot and
# normal Q-Q plot with reference line.
par(mfrow = c(2, 2))
hist(train$BedroomAbvGr, col = "red")
# The boxplot title names the variable actually plotted.
boxplot(train$BedroomAbvGr, main = "Boxplot BedroomAbvGr")
qqnorm(train$BedroomAbvGr)
qqline(train$BedroomAbvGr)
From the plots one can see that the data is skewed to the right
# Univariate plots for GarageArea on a 2 x 2 grid: histogram, boxplot and
# normal Q-Q plot with reference line.
par(mfrow = c(2, 2))
hist(train$GarageArea, col = "blue")
# The boxplot title names the variable actually plotted.
boxplot(train$GarageArea, main = "Boxplot GarageArea")
qqnorm(train$GarageArea)
qqline(train$GarageArea)
# Univariate plots for SalePrice on a 2 x 2 grid: histogram, boxplot and
# normal Q-Q plot with reference line.
par(mfrow = c(2, 2))
hist(train$SalePrice, col = "green")
# The boxplot title names the variable actually plotted.
boxplot(train$SalePrice, main = "Boxplot SalePrice")
qqnorm(train$SalePrice)
qqline(train$SalePrice)
# Scatterplots of SalePrice against each of the two predictors, each overlaid
# with its simple least-squares regression line in red.
plot(
  x = train$BedroomAbvGr, y = train$SalePrice,
  main = "Scatterplot SalePrice by BedroomAbvGr "
)
abline(lm(SalePrice ~ BedroomAbvGr, data = train), col = "red", lwd = 3)
plot(
  x = train$GarageArea, y = train$SalePrice,
  main = "Scatterplot SalePrice by GarageArea "
)
abline(lm(SalePrice ~ GarageArea, data = train), col = "red", lwd = 3)
Creating a multiple linear regression for the 2 independent variables
# Regress SalePrice on BedroomAbvGr and GarageArea and print the model summary.
fit <- lm(
  formula = SalePrice ~ BedroomAbvGr + GarageArea,
  data = train
)
summary(fit)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -266238 -33001 -4397 23587 477890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37074.363 6676.300 5.553 3.33e-08 ***
## BedroomAbvGr 12472.557 1972.131 6.324 3.37e-10 ***
## GarageArea 228.540 7.525 30.372 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61320 on 1457 degrees of freedom
## Multiple R-squared: 0.405, Adjusted R-squared: 0.4042
## F-statistic: 495.9 on 2 and 1457 DF, p-value: < 2.2e-16
# Diagnostic plots for the linear model stored in fit
plot(fit)
# Pearson test of H0: correlation = 0 between BedroomAbvGr and SalePrice,
# reporting an 80% confidence interval.
cor.test(train$BedroomAbvGr, train$SalePrice, method = "pearson" , conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$BedroomAbvGr and train$SalePrice
## t = 6.5159, df = 1458, p-value = 9.927e-11
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1354160 0.2006421
## sample estimates:
## cor
## 0.1682132
# Pearson test of H0: correlation = 0 between GarageArea and SalePrice,
# reporting an 80% confidence interval.
cor.test(train$GarageArea, train$SalePrice, method = "pearson" , conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$GarageArea and train$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
# Pearson test of H0: correlation = 0 between GarageCars and SalePrice,
# reporting an 80% confidence interval.
cor.test(train$GarageCars, train$SalePrice, method = "pearson" , conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$GarageCars and train$SalePrice
## t = 31.839, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6201771 0.6597899
## sample estimates:
## cor
## 0.6404092
One can observe that there is a relatively high positive correlation between SalePrice and both GarageCars and GarageArea, but only a weak correlation between BedroomAbvGr and SalePrice.
Linear Algebra and Correlation. Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# 3 x 3 Pearson correlation matrix of the three chosen quantitative variables
cor_data <- cor(train[c("BedroomAbvGr", "GarageArea", "GarageCars")])
cor_data
## BedroomAbvGr GarageArea GarageCars
## BedroomAbvGr 1.00000000 0.06525253 0.08610644
## GarageArea 0.06525253 1.00000000 0.88247541
## GarageCars 0.08610644 0.88247541 1.00000000
# Confirm the correlation matrix is 3 x 3
dim(cor_data)
## [1] 3 3
# Precision matrix: the inverse of the correlation matrix, obtained with solve()
pre_data <- solve(cor_data)
pre_data
## BedroomAbvGr GarageArea GarageCars
## BedroomAbvGr 1.00799861 0.04890745 -0.1299548
## GarageArea 0.04890745 4.52240963 -3.9951266
## GarageCars -0.12995479 -3.99512656 4.5367909
# Correlation matrix times its inverse: the identity matrix up to
# floating-point rounding error
cor_data %*% pre_data
## BedroomAbvGr GarageArea GarageCars
## BedroomAbvGr 1 0.000000e+00 0
## GarageArea 0 1.000000e+00 0
## GarageCars 0 -8.881784e-16 1
# Inverse times the correlation matrix: again the identity matrix up to
# floating-point rounding error
pre_data %*% cor_data
## BedroomAbvGr GarageArea GarageCars
## BedroomAbvGr 1.000000e+00 2.775558e-17 2.775558e-17
## GarageArea -5.551115e-17 1.000000e+00 -1.332268e-15
## GarageCars 5.551115e-17 0.000000e+00 1.000000e+00
# LU decomposition into lower ($L) and upper ($U) triangular factors.
# Note that it is applied to the precision matrix pre_data, not to cor_data.
lu.decomposition(pre_data)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] 0.04851936 1.0000000 0
## [3,] -0.12892358 -0.8824754 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1.007999 0.04890745 -0.1299548
## [2,] 0.000000 4.52003667 -3.9888212
## [3,] 0.000000 0.00000000 1.0000000
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Shift the chosen variable so its minimum is strictly above zero, then print
# that minimum.
# NOTE(review): 1e-32 is far below double precision relative to any nonzero
# count, so only exact zeros actually move (the printed minimum is 1e-32);
# confirm that such a tiny shift is what is intended.
BedroomAbvGr <- train$BedroomAbvGr + 1e-32
min(BedroomAbvGr)
## [1] 1e-32
# Fit an exponential distribution to the shifted variable and print the
# estimated rate with its standard error. This reassigns `fit`, which held
# the linear model earlier in the file.
(fit <- fitdistr(BedroomAbvGr, "exponential"))
## rate
## 0.348864994
## (0.009130214)
# lambda: the fitted exponential rate (assigned and printed)
(lambda <- fit[["estimate"]])
## rate
## 0.348865
# Draw 1000 values from the fitted exponential distribution and show their
# histogram side by side with the histogram of the observed variable.
samp <- rexp(n = 1000, rate = lambda)
par(mfrow = c(1, 2))
hist(samp, main = "Simulated", xlab = "BedroomAbvGr")
hist(train$BedroomAbvGr, main = "Observed", xlab = "BedroomAbvGr")
The Simulated Data is heavily skewed to the right where the Observed is more to the center.
# Empirical CDF of the simulated sample. The result is stored under the name
# `ecdf`, the same name as the function being called.
(ecdf<-ecdf(samp))
## Empirical CDF
## Call: ecdf(samp)
## x[1:1000] = 0.0035525, 0.0057715, 0.0058203, ..., 20.378, 21.837
# Percentiles of the simulated sample in 5% steps (includes the 5th and 95th)
quantile(samp, probs = seq(from = 0, to = 1, by = 0.05))
## 0% 5% 10% 15% 20%
## 0.003552548 0.129693509 0.276155837 0.421016613 0.541419284
## 25% 30% 35% 40% 45%
## 0.722549328 0.878742289 1.076093290 1.299830970 1.606809266
## 50% 55% 60% 65% 70%
## 1.880711609 2.191986108 2.505532875 2.882929179 3.314130632
## 75% 80% 85% 90% 95%
## 3.747102636 4.318757937 5.227683710 6.503420020 8.386185133
## 100%
## 21.837246380
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
I will build a multiple linear regression with the variables whose correlation with the dependent variable SalePrice is higher than 0.5.
# Create a data frame holding only the numeric columns of train.
# vapply() always returns a logical vector (sapply()'s return type depends on
# its input), and drop = FALSE keeps the result a data frame even if only one
# column is numeric.
quantVar <- vapply(train, is.numeric, logical(1))
quantVar_df <- train[, quantVar, drop = FALSE]
head(quantVar_df)
## Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## 1 1 60 65 8450 7 5 2003
## 2 2 20 80 9600 6 8 1976
## 3 3 60 68 11250 7 5 2001
## 4 4 70 60 9550 7 5 1915
## 5 5 60 84 14260 8 5 2000
## 6 6 50 85 14115 5 5 1993
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 2003 196 706 0 150 856
## 2 1976 0 978 0 284 1262
## 3 2002 162 486 0 434 920
## 4 1970 0 216 0 540 756
## 5 2000 350 655 0 490 1145
## 6 1995 0 732 0 64 796
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 1 856 854 0 1710 1 0
## 2 1262 0 0 1262 0 1
## 3 920 866 0 1786 1 0
## 4 961 756 0 1717 1 0
## 5 1145 1053 0 2198 1 0
## 6 796 566 0 1362 1 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
## 1 2 1 3 1 8 0
## 2 2 0 3 1 6 1
## 3 2 1 3 1 6 1
## 4 1 0 3 1 7 1
## 5 2 1 4 1 9 1
## 6 1 1 1 1 5 0
## GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch
## 1 2003 2 548 0 61 0
## 2 1976 2 460 298 0 0
## 3 2001 2 608 0 42 0
## 4 1998 3 642 0 35 272
## 5 2000 3 836 192 84 0
## 6 1993 2 480 40 30 0
## X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
## 1 0 0 0 0 2 2008 208500
## 2 0 0 0 0 5 2007 181500
## 3 0 0 0 0 9 2008 223500
## 4 0 0 0 0 2 2006 140000
## 5 0 0 0 0 12 2008 250000
## 6 320 0 0 700 10 2009 143000
# Correlation of every numeric column with SalePrice, used to pick the
# variables whose correlation is above 0.5.
# Iterating over the columns with vapply() avoids apply()'s coercion of the
# whole data frame to a matrix and guarantees one number per column.
# use = "complete.obs" drops the rows where the column is NA.
corSales <- data.frame(
  cor = vapply(
    quantVar_df,
    function(col) cor(col, quantVar_df$SalePrice, use = "complete.obs"),
    numeric(1)
  )
)
corSales
## cor
## Id -0.02191672
## MSSubClass -0.08428414
## LotFrontage 0.35179910
## LotArea 0.26384335
## OverallQual 0.79098160
## OverallCond -0.07785589
## YearBuilt 0.52289733
## YearRemodAdd 0.50710097
## MasVnrArea 0.47749305
## BsmtFinSF1 0.38641981
## BsmtFinSF2 -0.01137812
## BsmtUnfSF 0.21447911
## TotalBsmtSF 0.61358055
## X1stFlrSF 0.60585218
## X2ndFlrSF 0.31933380
## LowQualFinSF -0.02560613
## GrLivArea 0.70862448
## BsmtFullBath 0.22712223
## BsmtHalfBath -0.01684415
## FullBath 0.56066376
## HalfBath 0.28410768
## BedroomAbvGr 0.16821315
## KitchenAbvGr -0.13590737
## TotRmsAbvGrd 0.53372316
## Fireplaces 0.46692884
## GarageYrBlt 0.48636168
## GarageCars 0.64040920
## GarageArea 0.62343144
## WoodDeckSF 0.32441344
## OpenPorchSF 0.31585623
## EnclosedPorch -0.12857796
## X3SsnPorch 0.04458367
## ScreenPorch 0.11144657
## PoolArea 0.09240355
## MiscVal -0.02118958
## MoSold 0.04643225
## YrSold -0.02892259
## SalePrice 1.00000000
# Rows whose correlation with SalePrice exceeds 0.5 (SalePrice itself included)
(corSales[which(corSales$cor > 0.5), , drop = FALSE])
## cor
## OverallQual 0.7909816
## YearBuilt 0.5228973
## YearRemodAdd 0.5071010
## TotalBsmtSF 0.6135806
## X1stFlrSF 0.6058522
## GrLivArea 0.7086245
## FullBath 0.5606638
## TotRmsAbvGrd 0.5337232
## GarageCars 0.6404092
## GarageArea 0.6234314
## SalePrice 1.0000000
# Multiple linear regression of SalePrice on the ten predictors whose
# correlation with SalePrice exceeds 0.5, followed by the model summary.
model <- lm(
  SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF +
    X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd + GarageCars + GarageArea,
  data = train
)
summary(model)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + YearRemodAdd +
## TotalBsmtSF + X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd +
## GarageCars + GarageArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489958 -19316 -1948 16020 290558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.186e+06 1.291e+05 -9.187 < 2e-16 ***
## OverallQual 1.960e+04 1.190e+03 16.472 < 2e-16 ***
## YearBuilt 2.682e+02 5.035e+01 5.328 1.15e-07 ***
## YearRemodAdd 2.965e+02 6.363e+01 4.659 3.47e-06 ***
## TotalBsmtSF 1.986e+01 4.295e+00 4.625 4.09e-06 ***
## X1stFlrSF 1.417e+01 4.930e+00 2.875 0.004097 **
## GrLivArea 5.130e+01 4.233e+00 12.119 < 2e-16 ***
## FullBath -6.791e+03 2.682e+03 -2.532 0.011457 *
## TotRmsAbvGrd 3.310e+01 1.119e+03 0.030 0.976404
## GarageCars 1.042e+04 3.044e+03 3.422 0.000639 ***
## GarageArea 1.495e+01 1.031e+01 1.450 0.147384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared: 0.7737, Adjusted R-squared: 0.7721
## F-statistic: 495.4 on 10 and 1449 DF, p-value: < 2.2e-16
The R^2 is 0.7737, i.e. 77.37% of the variance in SalePrice can be explained by the model.
# Load the Kaggle House Prices test set, keeping text columns as character,
# and show the first six rows.
test <- read.csv(
  file = "https://raw.githubusercontent.com/VioletaStoyanova/Data605/master/test.csv",
  stringsAsFactors = FALSE
)
head(test, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl NAmes Feedr
## 2 Lvl AllPub Corner Gtl NAmes Norm
## 3 Lvl AllPub Inside Gtl Gilbert Norm
## 4 Lvl AllPub Inside Gtl Gilbert Norm
## 5 HLS AllPub Inside Gtl StoneBr Norm
## 6 Lvl AllPub Corner Gtl Gilbert Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 1Story 5 6 1961
## 2 Norm 1Fam 1Story 6 6 1958
## 3 Norm 1Fam 2Story 5 5 1997
## 4 Norm 1Fam 2Story 6 6 1998
## 5 Norm TwnhsE 1Story 8 5 1992
## 6 Norm 1Fam 2Story 6 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1961 Gable CompShg VinylSd VinylSd None
## 2 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 3 1998 Gable CompShg VinylSd VinylSd None
## 4 1998 Gable CompShg VinylSd VinylSd BrkFace
## 5 1992 Gable CompShg HdBoard HdBoard None
## 6 1994 Gable CompShg HdBoard HdBoard None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA CBlock TA TA No
## 2 108 TA TA CBlock TA TA No
## 3 0 TA TA PConc Gd TA No
## 4 20 TA TA PConc TA TA No
## 5 0 Gd TA PConc Gd TA No
## 6 0 TA TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Rec 468 LwQ 144 270 882
## 2 ALQ 923 Unf 0 406 1329
## 3 GLQ 791 Unf 0 137 928
## 4 GLQ 602 Unf 0 324 926
## 5 ALQ 263 Unf 0 1017 1280
## 6 Unf 0 Unf 0 763 763
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA TA Y SBrkr 896 0 0
## 2 GasA TA Y SBrkr 1329 0 0
## 3 GasA Gd Y SBrkr 928 701 0
## 4 GasA Ex Y SBrkr 926 678 0
## 5 GasA Ex Y SBrkr 1280 0 0
## 6 GasA Gd Y SBrkr 763 892 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 896 0 0 1 0 2
## 2 1329 0 0 1 1 3
## 3 1629 0 0 2 1 3
## 4 1604 0 0 2 1 3
## 5 1280 0 0 2 0 2
## 6 1655 0 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 5 Typ 0 <NA>
## 2 1 Gd 6 Typ 0 <NA>
## 3 1 TA 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 5 Typ 0 <NA>
## 6 1 TA 7 Typ 1 TA
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 1961 Unf 1 730 TA
## 2 Attchd 1958 Unf 1 312 TA
## 3 Attchd 1997 Fin 2 482 TA
## 4 Attchd 1998 Fin 2 470 TA
## 5 Attchd 1992 RFn 2 506 TA
## 6 Attchd 1993 Fin 2 440 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 140 0 0 0
## 2 TA Y 393 36 0 0
## 3 TA Y 212 34 0 0
## 4 TA Y 360 36 0 0
## 5 TA Y 0 82 0 0
## 6 TA Y 157 84 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 120 0 <NA> MnPrv <NA> 0 6 2010
## 2 0 0 <NA> <NA> Gar2 12500 6 2010
## 3 0 0 <NA> MnPrv <NA> 0 3 2010
## 4 0 0 <NA> <NA> <NA> 0 6 2010
## 5 144 0 <NA> <NA> <NA> 0 1 2010
## 6 0 0 <NA> <NA> <NA> 0 4 2010
## SaleType SaleCondition
## 1 WD Normal
## 2 WD Normal
## 3 WD Normal
## 4 WD Normal
## 5 WD Normal
## 6 WD Normal
# Predict SalePrice for the test set.
# predict() returns NA for any row with a missing predictor. Submitting a
# price of 0 for those houses is not a meaningful prediction, so missing
# predictor values are filled with the training-set median before predicting.
predictors <- attr(terms(model), "term.labels")
test_imputed <- test
test_imputed[predictors] <- lapply(predictors, function(v) {
  column <- test[[v]]
  column[is.na(column)] <- median(train[[v]], na.rm = TRUE)
  column
})
mySalePrice <- predict(model, newdata = test_imputed)
# Create the submission data frame. Negative predictions are clamped to 0 on
# the SalePrice column only, so the Id column is never modified.
pricepred <- data.frame(Id = test[["Id"]], SalePrice = pmax(mySalePrice, 0))
head(pricepred)
## Id SalePrice
## 1 1461 110135.9
## 2 1462 159060.0
## 3 1463 169683.7
## 4 1464 188059.7
## 5 1465 219782.0
## 6 1466 182152.0
# Write the submission (Id, SalePrice) to pricepred.csv without row names
write.csv(x = pricepred, file = "pricepred.csv", row.names = FALSE)
My Kaggle score was 0.85356 and my name is violetastoyanova