#libraries
library(tidyr)
library(dplyr)
library(pander)
library(ggplot2)
library(corrplot)
library(matlib)
library(matrixcalc)
library(MASS)
library(mltools)
library(caTools)
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean and standard deviation $\mu = \sigma = (N+1)/2$.
Calculate, as a minimum, the probabilities a through c below. Assume the small letter "x" is estimated as the median of the X variable, and the small letter "y" is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
# Simulate the two random variables used in the probability exercise.
# A fixed seed makes the simulated values (and everything derived from them)
# reproducible from run to run.
set.seed(123)
# N is the upper bound of the uniform draw. The problem statement requires
# N >= 6; the previous value of 3 did not satisfy that.
n <- 6
# X: 10,000 draws from Uniform(1, N).
X <- runif(10000, min = 1, max = n)
# Y: 10,000 draws from Normal(mu, sigma) with mu = sigma = (N + 1) / 2.
Y <- rnorm(10000, mean = (n + 1) / 2, sd = (n + 1) / 2)
# NOTE: echoed results that follow in this document were captured with the
# earlier N = 3 and no seed; regenerate them after this change.
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.497 2.002 2.000 2.503 3.000
# Thresholds as defined in the problem statement: x is the median of X and
# y is the first quartile of Y (previously the mean of X and the first
# quartile of X were used).
x <- median(X)
y <- quantile(Y, probs = 0.25, names = FALSE)
# Event counts: A = {X > x}, B = {X > y}.
pA <- sum(X > x)
pB <- sum(X > y)
# Conditional probability P(X > x | X > y) = #(A and B) / #(B).
# The earlier expression multiplied and divided by the same term, so it
# reduced to the marginal P(X > x) and ignored the condition entirely.
# NOTE: any echoed value shown after this chunk predates this correction.
sum(X > x & X > y) / pB
## [1] 0.5005
# Event counts: A = {X > x}, B = {Y > y}.
pA <- sum(X > x)
pB <- sum(Y > y)
# Proportion of simulated pairs in which both X > x and Y > y hold, i.e. the
# joint probability P(X > x, Y > y). The earlier expression cancelled down to
# P(X > x) alone and never used the Y > y event.
# NOTE(review): the list of probabilities (a-c) is not reproduced in this
# document; this is presumed to be the joint probability -- confirm against
# the assignment. Any echoed value shown after this chunk predates this change.
mean(X > x & Y > y)
## [1] 0.5005
# Event counts: A = {X < x}, B = {X > y}.
pA <- sum(X < x)
pB <- sum(X > y)
# Conditional probability P(X < x | X > y) = #(A and B) / #(B).
# The earlier expression multiplied and divided by the same term, so it
# reduced to the marginal P(X < x) and ignored the condition.
# NOTE: any echoed value shown after this chunk predates this correction.
sum(X < x & X > y) / pB
## [1] 0.4995
# Read the training and test data sets from CSV files in the working
# directory, then print a column-by-column summary of the training data.
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")
print(summary(train))
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Column-by-column summary of the test data, for comparison with the
# training summary.
print(summary(object = test))
## Id MSSubClass MSZoning LotFrontage
## Min. :1461 Min. : 20.00 Length:1459 Min. : 21.00
## 1st Qu.:1826 1st Qu.: 20.00 Class :character 1st Qu.: 58.00
## Median :2190 Median : 50.00 Mode :character Median : 67.00
## Mean :2190 Mean : 57.38 Mean : 68.58
## 3rd Qu.:2554 3rd Qu.: 70.00 3rd Qu.: 80.00
## Max. :2919 Max. :190.00 Max. :200.00
## NA's :227
## LotArea Street Alley LotShape
## Min. : 1470 Length:1459 Length:1459 Length:1459
## 1st Qu.: 7391 Class :character Class :character Class :character
## Median : 9399 Mode :character Mode :character Mode :character
## Mean : 9819
## 3rd Qu.:11518
## Max. :56600
##
## LandContour Utilities LotConfig LandSlope
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1459 Min. : 1.000 Min. :1.000 Min. :1879
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1953
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.079 Mean :5.554 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1459 Length:1459 Length:1459
## 1st Qu.:1963 Class :character Class :character Class :character
## Median :1992 Mode :character Mode :character Mode :character
## Mean :1984
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 100.7
## 3rd Qu.: 164.0
## Max. :1290.0
## NA's :15
## ExterCond Foundation BsmtQual BsmtCond
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 350.5 Mode :character
## Mean : 439.2
## 3rd Qu.: 753.5
## Max. :4010.0
## NA's :1
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0 Length:1459
## 1st Qu.: 0.00 1st Qu.: 219.2 1st Qu.: 784 Class :character
## Median : 0.00 Median : 460.0 Median : 988 Mode :character
## Mean : 52.62 Mean : 554.3 Mean :1046
## 3rd Qu.: 0.00 3rd Qu.: 797.8 3rd Qu.:1305
## Max. :1526.00 Max. :2140.0 Max. :5095
## NA's :1 NA's :1 NA's :1
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1459 Length:1459 Length:1459 Min. : 407.0
## Class :character Class :character Class :character 1st Qu.: 873.5
## Mode :character Mode :character Mode :character Median :1079.0
## Mean :1156.5
## 3rd Qu.:1382.5
## Max. :5095.0
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 407 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1118 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1432 Median :0.0000
## Mean : 326 Mean : 3.543 Mean :1486 Mean :0.4345
## 3rd Qu.: 676 3rd Qu.: 0.000 3rd Qu.:1721 3rd Qu.:1.0000
## Max. :1862 Max. :1064.000 Max. :5095 Max. :3.0000
## NA's :2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.0652 Mean :1.571 Mean :0.3777 Mean :2.854
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.0000 Max. :4.000 Max. :2.0000 Max. :6.000
## NA's :2
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1459 Min. : 3.000 Length:1459
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.042 Mean : 6.385
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.000 Max. :15.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.0000 Length:1459 Length:1459 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1959
## Median :0.0000 Mode :character Mode :character Median :1979
## Mean :0.5812 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :78
## GarageFinish GarageCars GarageArea GarageQual
## Length:1459 Min. :0.000 Min. : 0.0 Length:1459
## Class :character 1st Qu.:1.000 1st Qu.: 318.0 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.766 Mean : 472.8
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :5.000 Max. :1488.0
## NA's :1 NA's :1
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1459 Length:1459 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 28.00
## Mean : 93.17 Mean : 48.31
## 3rd Qu.: 168.00 3rd Qu.: 72.00
## Max. :1424.00 Max. :742.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 24.24 Mean : 1.794 Mean : 17.06 Mean : 1.744
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :1012.00 Max. :360.000 Max. :576.00 Max. :800.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1459 Length:1459 Length:1459 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 58.17
## 3rd Qu.: 0.00
## Max. :17000.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1459 Length:1459
## 1st Qu.: 4.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.104 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
# Univariate descriptive statistics for SalePrice in the training data
# (mean, standard deviation, quartiles, minimum and maximum), rendered as a
# table with pander.
pander(
  summarize(
    train,
    variable = "Sale Price",
    mean = mean(SalePrice),
    st_dev = sd(SalePrice),
    q0.25 = quantile(SalePrice, probs = 0.25),
    q0.5 = quantile(SalePrice, probs = 0.5),
    q0.75 = quantile(SalePrice, probs = 0.75),
    min = min(SalePrice),
    max = max(SalePrice)
  )
)
| variable | mean | st_dev | q0.25 | q0.5 | q0.75 | min | max |
|---|---|---|---|---|---|---|---|
| Sale Price | 180921 | 79443 | 129975 | 163000 | 214000 | 34900 | 755000 |
# Histogram showing how training-set sale prices are distributed.
hist(
  x = train[["SalePrice"]],
  main = "Sale Price Distribution",
  xlab = "Sale Price",
  col = "brown1"
)
The scatterplot matrix and correlation matrix compare the dependent variable, Sale Price, to 4 independent variables:

- Year Built
- Year Remodeled
- First Floor Square Feet
- Second Floor Square Feet
Scatterplot Matrix
# Keep the response plus the four predictors under study, then draw every
# pairwise scatterplot. select() is called with its dplyr:: prefix because
# MASS, attached after dplyr, also provides a select().
df_scatter <- dplyr::select(
  train,
  SalePrice, YearBuilt, YearRemodAdd, X1stFlrSF, X2ndFlrSF
)
pairs(df_scatter)
Correlation matrix
# Pearson correlation matrix of the five selected variables, stored for the
# later matrix-algebra steps and printed for inspection.
c_matrix <- cor(df_scatter, method = "pearson")
print(c_matrix)
## SalePrice YearBuilt YearRemodAdd X1stFlrSF X2ndFlrSF
## SalePrice 1.0000000 0.52289733 0.5071010 0.6058522 0.31933380
## YearBuilt 0.5228973 1.00000000 0.5928550 0.2819859 0.01030766
## YearRemodAdd 0.5071010 0.59285498 1.0000000 0.2403793 0.14002378
## X1stFlrSF 0.6058522 0.28198586 0.2403793 1.0000000 -0.20264618
## X2ndFlrSF 0.3193338 0.01030766 0.1400238 -0.2026462 1.00000000
# Visualise the correlation matrix; only the upper triangle is requested.
corrplot(corr = c_matrix, type = "upper")
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
# Test H0: the correlation between SalePrice and YearBuilt is 0, and report
# an 80% confidence interval for it.
cor.test(
  x = df_scatter$SalePrice,
  y = df_scatter$YearBuilt,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: df_scatter$SalePrice and df_scatter$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4980766 0.5468619
## sample estimates:
## cor
## 0.5228973
# Test H0: the correlation between SalePrice and YearRemodAdd is 0, and
# report an 80% confidence interval for it.
cor.test(
  x = df_scatter$SalePrice,
  y = df_scatter$YearRemodAdd,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: df_scatter$SalePrice and df_scatter$YearRemodAdd
## t = 22.466, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4817381 0.5316150
## sample estimates:
## cor
## 0.507101
# Test H0: the correlation between SalePrice and X1stFlrSF is 0, and report
# an 80% confidence interval for it.
cor.test(
  x = df_scatter$SalePrice,
  y = df_scatter$X1stFlrSF,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: df_scatter$SalePrice and df_scatter$X1stFlrSF
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5841687 0.6266715
## sample estimates:
## cor
## 0.6058522
# Test H0: the correlation between SalePrice and X2ndFlrSF is 0, and report
# an 80% confidence interval for it.
cor.test(
  x = df_scatter$SalePrice,
  y = df_scatter$X2ndFlrSF,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: df_scatter$SalePrice and df_scatter$X2ndFlrSF
## t = 12.867, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2888681 0.3491534
## sample estimates:
## cor
## 0.3193338
The p-values for all 4 correlation tests are below 2.2e-16, so each correlation is statistically significant. The strongest correlation is between the Sale Price and the First Floor square footage (0.6059), followed by the Year Built (0.5229). The Second Floor square footage has the lowest correlation of the group (0.3193).
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# Precision matrix: the inverse of the correlation matrix.
inv_matrix <- inv(c_matrix)

# Multiply in both orders. Each product should be (numerically close to)
# the identity matrix.
# Precision matrix times correlation matrix:
c2 <- inv_matrix %*% c_matrix
# Correlation matrix times precision matrix:
c1 <- c_matrix %*% inv_matrix
# LU decomposition of the (correlation x precision) product; the result
# holds the lower-triangular factor $L and the upper-triangular factor $U.
print(lu.decomposition(c1))
## $L
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## [2,] -2.636570e-09 1.000000e+00 0.000000e+00 0.000000e+00 0
## [3,] -2.852093e-09 -6.981081e-09 1.000000e+00 0.000000e+00 0
## [4,] -5.222103e-09 -5.862071e-09 -2.092188e-09 1.000000e+00 0
## [5,] -4.574617e-09 1.460747e-09 1.804052e-12 -5.977337e-10 1
##
## $U
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 -5.542962e-09 -2.225343e-09 -9.510496e-09 -3.102032e-09
## [2,] 0 1.000000e+00 -3.742093e-09 -8.125003e-09 -1.300352e-09
## [3,] 0 0.000000e+00 1.000000e+00 -7.009452e-09 -1.728247e-09
## [4,] 0 0.000000e+00 0.000000e+00 1.000000e+00 -2.845748e-09
## [5,] 0 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00
# LU decomposition of the (precision x correlation) product.
print(lu.decomposition(c2))
## $L
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## [2,] -5.542962e-09 1.000000e+00 0.000000e+00 0.000000e+00 0
## [3,] -2.225343e-09 -3.742093e-09 1.000000e+00 0.000000e+00 0
## [4,] -9.510496e-09 -8.125003e-09 -7.009452e-09 1.000000e+00 0
## [5,] -3.102032e-09 -1.300352e-09 -1.728247e-09 -2.845748e-09 1
##
## $U
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 -2.63657e-09 -2.852093e-09 -5.222102e-09 -4.574617e-09
## [2,] 0 1.00000e+00 -6.981081e-09 -5.862071e-09 1.460747e-09
## [3,] 0 0.00000e+00 1.000000e+00 -2.092188e-09 1.804052e-12
## [4,] 0 0.00000e+00 0.000000e+00 1.000000e+00 -5.977337e-10
## [5,] 0 0.00000e+00 0.000000e+00 0.000000e+00 1.000000e+00
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of $\lambda$ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, $\lambda$)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Histogram of the observed sale prices, shown again here as the baseline
# for comparison with the simulated exponential sample.
hist(
  x = train[["SalePrice"]],
  main = "Sale Price Distribution",
  xlab = "Sale Price",
  col = "brown1"
)
# Fit an exponential distribution to SalePrice with MASS::fitdistr.
# SalePrice is already strictly positive (its minimum is 34,900), so no
# shift is applied before fitting.
# The fit is stored as exp_fit rather than exp so base::exp is not masked.
exp_fit <- fitdistr(train$SalePrice, densfun = "exponential")
# Estimated rate parameter (lambda) of the fitted exponential.
exp_est <- exp_fit$estimate

# Draw 1,000 values from the fitted exponential and plot them so the shape
# can be compared with the histogram of the observed prices.
hist(rexp(1000, exp_est),
     main = "Exponential Distribution of Sales Prices",
     col = "brown1",
     xlab = "1,000 Samples of Sales Prices")

# 5th and 95th percentiles implied by the fitted exponential, obtained from
# its quantile function (the inverse of the CDF).
qexp(c(0.05, 0.95), rate = exp_est)

# 95% confidence interval for the mean sale price from the empirical data,
# assuming normality: mean +/- z * standard error.
sale_mean <- mean(train$SalePrice)
sale_se <- sd(train$SalePrice) / sqrt(length(train$SalePrice))
sale_mean + c(-1, 1) * qnorm(0.975) * sale_se
# Empirical 5th and 95th percentiles of the observed sale prices.
quantile(x = train$SalePrice, probs = c(0.05, 0.95))
## 5% 95%
## 88000 326100
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
# Multiple linear regression of SalePrice on first-floor square footage and
# year built, fitted on the training data.
regressor <- lm(formula = SalePrice ~ X1stFlrSF + YearBuilt, data = train)

# Reduce the test data to the two predictor columns the model uses.
test_filtered <- dplyr::select(test, X1stFlrSF, YearBuilt)

# Coefficient table, residual summary and fit statistics.
print(summary(regressor))
##
## Call:
## lm(formula = SalePrice ~ X1stFlrSF + YearBuilt, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -419062 -33106 -10242 24623 420077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.921e+06 9.875e+04 -19.45 <2e-16 ***
## X1stFlrSF 1.023e+02 3.961e+00 25.84 <2e-16 ***
## YearBuilt 1.006e+03 5.070e+01 19.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56120 on 1457 degrees of freedom
## Multiple R-squared: 0.5017, Adjusted R-squared: 0.501
## F-statistic: 733.5 on 2 and 1457 DF, p-value: < 2.2e-16
# Split the plotting device into a 2 x 2 grid (cells numbered column by
# column) and draw the model's diagnostic plots into it.
layout(matrix(1:4, nrow = 2, ncol = 2))
plot(regressor)
# Predict sale prices for the test data and pair each prediction with the
# corresponding Id.
pred <- predict(regressor, newdata = test_filtered)
pred_df <- data.frame(Id = test[["Id"]], SalePrice = pred)
# Preview the first rows of the prediction table.
print(head(pred_df))
## Id SalePrice
## 1 1461 143305.8
## 2 1462 184600.0
## 3 1463 182796.7
## 4 1464 183598.0
## 5 1465 213789.6
## 6 1466 161887.0
# Save the Id/SalePrice predictions as a CSV file, without row names.
write.csv(
  x = pred_df,
  file = "LCancel_Kaggle_Submission.csv",
  row.names = FALSE
)
I used two of the fields from the correlation matrix (First Floor square footage and Year Built) in this linear model, since those fields have the highest correlations with Sale Price.
Residuals vs Fitted: The points for the residuals are clustered together around the linear model.
Scale-Location: The points are also clustered together in this graph, but there are more spaced-out points, showing more outliers.
Normal Q-Q: Of all the plots, this one follows the linear model most closely.
Kaggle Score = 0.285 Kaggle Username = ltcancel