library(MASS)     # fitdistr, ginv
library(knitr)    # kable
library(dplyr)    # select
library(ggplot2)
library(DT)       # datatable
library(reshape)  # melt
library(corrplot) # corrplot
library(Rmisc)    # CI
df <- read.csv("train.csv") # Kaggle training data

Introduction

Below is the dataset of house prices available from Kaggle.com. The dataset has 1460 observations of houses in Ames, Iowa, and 79 variables potentially contributing to the house sale price.

The full dataset and dictionary are available at: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

#kable(head(df))
datatable(df, options = list(pageLength = 5, lengthMenu = c(5, 10, 40),
  initComplete = JS(
    "function(settings, json) {",
    "$(this.api().table().header()).css({'background-color': '#01975b', 'color': '#fff'});",
    "}")), rownames = TRUE)

Part 1: Variables

Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

#test variable
X1<-df$OverallQual
Y1<-df$SalePrice

plot(X1,Y1)
hist(X1, col="blue", main="Histogram of Overall Quality")
#chosen variable
X<-df$YearBuilt
Y<-df$SalePrice

plot(X,Y, col="#4caf50", main="Scatterplot of Year Built and Sale Price", xlab = "Year Built", ylab="Sale Price")
abline(lm(Y~X), col="yellow", lwd=3) # regression line (y~x) 

hist(X, col="green", main="Histogram of Year Built", xlab = "Year Built")

hist(Y, col="#80cbc4", main="Histogram of Sale Price", xlab = "Sale Price")

print("Summary of X variable: Year Built")
## [1] "Summary of X variable: Year Built"
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1872    1954    1973    1971    2000    2010
print("Summary of Y variable: Sale Price")
## [1] "Summary of Y variable: Sale Price"
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  130000  163000  180900  214000  755000

Part 2: Probability

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

  a. \[ p_1 = p(X>x \mid Y>y) \]

Given an above median sale price, the probability that a house has a year built greater than the third quartile.

XQ3<-quantile(X, probs=0.75) #2000, 3rd quartile of X variable
YQ2<-quantile(Y, probs=0.50) #163000, 2nd quartile (median) of Y variable

n<-(nrow(df))
yearbuilt<-as.numeric(df$YearBuilt)
saleprice<-as.numeric(df$SalePrice)

nYQ2<-nrow(subset(df,saleprice>YQ2))


p1<-nrow(subset(df, yearbuilt > XQ3 & saleprice>YQ2))/nYQ2
p1
## [1] 0.4436813
  b. \[ p_2 = p(X>x, Y>y) \]

Given the complete data set, the probability that a house has a year built greater than the third quartile and a sale price above the median.

p2<-nrow(subset(df, yearbuilt > XQ3 & saleprice>YQ2))/n
p2
## [1] 0.2212329
  c. \[ p_3 = p(X<x \mid Y>y) \]

Given an above-median sale price, the probability that a house's year built is at or below the third quartile (the code uses <= so that p1 and p3 together account for all above-median houses).

p3<-nrow(subset(df, yearbuilt <=XQ3 & saleprice>YQ2))/nYQ2
p3
## [1] 0.5563187
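
As a sanity check, the same conditional probabilities can be computed with logical vectors; p1 and p3 are complementary given an above-median sale price, so they must sum to 1. A quick sketch:

#verify p1 and p3 with logical vectors; together they partition the above-median houses
p1_alt <- mean(yearbuilt > XQ3 & saleprice > YQ2) / mean(saleprice > YQ2)
p3_alt <- mean(yearbuilt <= XQ3 & saleprice > YQ2) / mean(saleprice > YQ2)
p1_alt + p3_alt #equals 1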
c1<-nrow(subset(df, yearbuilt <=XQ3 & saleprice<=YQ2))/n
c2<-nrow(subset(df, yearbuilt <=XQ3 & saleprice>YQ2))/n
c3<-c1+c2
c4<-nrow(subset(df, yearbuilt >XQ3 & saleprice<=YQ2))/n
c5<-nrow(subset(df, yearbuilt >XQ3 & saleprice>YQ2))/n
c6<-c4+c5
c7<-c1+c4
c8<-c2+c5
c9<-c3+c6
dfcounts<-matrix(round(c(c1,c2,c3,c4,c5,c6,c7,c8,c9),3), ncol=3, nrow=3, byrow=TRUE)
colnames(dfcounts)<-c(
"<=2d quartile",
">2d quartile",
"Total")
rownames(dfcounts)<-c("<=3rd quartile",">3rd quartile","Total")

print("Quartile Matrix by Percentage")
## [1] "Quartile Matrix by Percentage"
dfcounts<-as.table(dfcounts)
dfcounts
##                <=2d quartile >2d quartile Total
## <=3rd quartile         0.473        0.277 0.751
## >3rd quartile          0.028        0.221 0.249
## Total                  0.501        0.499 1.000
print("Quartile Matrix by Count")
## [1] "Quartile Matrix by Count"
dfvals<-round(dfcounts*n,0)
dfvals
##                <=2d quartile >2d quartile Total
## <=3rd quartile           691          404  1096
## >3rd quartile             41          323   364
## Total                    731          729  1460
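
The same contingency table of counts can be produced in one call with table() and addmargins(), which avoids the small rounding artifacts introduced by scaling rounded proportions back up by n. A sketch:

#counts directly from the logical splits, with row and column totals
addmargins(table(YearBuilt = yearbuilt > XQ3, SalePrice = saleprice > YQ2))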

Part 3: Independence

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3rd quartile for X, and let B be the new variable counting those observations for the 2nd quartile for Y. Does P(A|B)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

papb<-c6*c8 #p(A)=p(X>x) and p(B)=p(Y>y) are the marginal totals from the table above
print (paste0("p(A)*p(B)=", round(papb,5)))
## [1] "p(A)*p(B)=0.12432"

\[ p(A \mid B) = p(X>x \mid Y>y) = 0.444 \]

\[ p(A) \cdot p(B) = 0.124 \]

\[ p(A \mid B) \neq p(A) \cdot p(B) \]

mat <- matrix(c(691, 404, 41, 323), 2, 2, byrow=TRUE) #2x2 counts from the quartile table

chisq.test(mat, correct=TRUE) 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mat
## X-squared = 291.61, df = 1, p-value < 2.2e-16
With a test statistic of 291.61 and a p-value < 2.2e-16, the test rejects the null hypothesis of independence: the year-built split and the sale-price split are strongly associated.

#alternate chi sq approach: build the contingency table directly
chisq.test(table(yearbuilt > XQ3, saleprice > YQ2)) #same result as chisq.test(mat)

Part 4: Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y.

Also see Part 1.

isnum <- sapply(df, is.numeric)
dfnum<-df[ , isnum]
summary(dfnum)
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF       X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 
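
To supplement the summary table with the plots the prompt asks for, the quick sketch below draws histograms for a few of the numeric variables; any subset of names(dfnum) works in place of the three chosen here.

#histograms of selected numeric variables, side by side
par(mfrow = c(1, 3))
for (v in c("LotArea", "GrLivArea", "SalePrice")) {
  hist(dfnum[[v]], main = paste("Histogram of", v), xlab = v, col = "lightblue")
}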

Confidence interval

Provide a 95% CI for the difference in the mean of the variables.

#t.test(x,y)
t.test(df$YearBuilt, df$SalePrice)
## 
##  Welch Two Sample t-test
## 
## data:  df$YearBuilt and df$SalePrice
## t = -86.071, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -183028.3 -174871.6
## sample estimates:
##  mean of x  mean of y 
##   1971.268 180921.196
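
For reference, the interval can be reproduced by hand from the Welch formula; the sketch below plugs in the degrees of freedom reported by t.test above.

#Welch 95% CI for the difference in means, computed manually
mx <- mean(df$YearBuilt); my <- mean(df$SalePrice)
se <- sqrt(var(df$YearBuilt)/nrow(df) + var(df$SalePrice)/nrow(df)) #standard error of the difference
(mx - my) + c(-1, 1) * qt(0.975, 1459) * se #matches the t.test interval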

Selective correlation matrix for chosen variables

Derive a correlation matrix for two of the quantitative variables you selected.

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

myvars<-data.frame(df$YearBuilt, df$SalePrice)
#head(myvars) #view header
cor(myvars)
##              df.YearBuilt df.SalePrice
## df.YearBuilt    1.0000000    0.5228973
## df.SalePrice    0.5228973    1.0000000
cor.test(df$YearBuilt, df$SalePrice, conf.level = 0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  df$YearBuilt and df$SalePrice
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.4721529 0.5701878
## sample estimates:
##       cor 
## 0.5228973
t.test(df$YearBuilt, df$SalePrice, conf.level = 0.99)
## 
##  Welch Two Sample t-test
## 
## data:  df$YearBuilt and df$SalePrice
## t = -86.071, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -184312.4 -173587.5
## sample estimates:
##  mean of x  mean of y 
##   1971.268 180921.196
mymx<-as.matrix(cor(myvars))

With a t-statistic of 23.424 and a p-value < 2.2e-16, we reject the null hypothesis that the correlation is zero. With 99 percent confidence, the correlation between Year Built and Sale Price is estimated to be between 0.47 and 0.57: newer houses tend to sell for more.
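
The 99 percent interval can also be reconstructed with the Fisher z-transformation, which is what cor.test uses internally; a sketch using the reported correlation and the sample size n defined earlier:

r <- 0.5228973 #sample correlation from cor.test above
z <- atanh(r) #Fisher z-transformation
se <- 1/sqrt(n - 3) #approximate standard error of z
tanh(z + c(-1, 1) * qnorm(0.995) * se) #back-transform: approximately (0.472, 0.570)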

Part 5: Correlation

Linear Algebra and Correlation. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

Correlation Matrix, Precision Matrix, Identity Matrix

#my correlation matrix
mymx
##              df.YearBuilt df.SalePrice
## df.YearBuilt    1.0000000    0.5228973
## df.SalePrice    0.5228973    1.0000000
#inverse of my correlation matrix, precision matrix
ginvmymx<-ginv(mymx)
ginvmymx
##            [,1]       [,2]
## [1,]  1.3763140 -0.7196709
## [2,] -0.7196709  1.3763140
#corr mat x precision mat
mymxginv<-mymx%*%ginvmymx
round(mymxginv,2)
##              [,1] [,2]
## df.YearBuilt    1    0
## df.SalePrice    0    1
#precision mat x corr mat
pxc<-ginvmymx%*%mymx
round(pxc,2)
##      df.YearBuilt df.SalePrice
## [1,]            1            0
## [2,]            0            1
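
As a cross-check, base R's solve() returns the same inverse for this invertible matrix, and the diagonal of the precision matrix matches the variance inflation factor formula 1/(1 - r^2). A sketch:

solve(mymx) #same inverse as ginv for an invertible matrix
1/(1 - 0.5228973^2) #reproduces the diagonal entry, 1.3763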

Principal Components Analysis

Conduct principal components analysis (research this!) and interpret. Discuss.
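
A minimal PCA sketch follows, assuming we restrict to the numeric columns without missing values so that prcomp can standardize them; interpretation should follow the loadings and variance proportions it actually reports.

#PCA on complete numeric columns, standardized
pcadata <- dfnum[, colSums(is.na(dfnum)) == 0]
pcadata$Id <- NULL #drop the row index, which is not a measurement
pca <- prcomp(pcadata, center = TRUE, scale. = TRUE)
summary(pca)$importance[, 1:5] #variance explained by the first five components
biplot(pca, cex = 0.5) #loadings of the first two components

The proportion-of-variance row shows how much of the total variation each component captures, and the loadings show which variables move together; components dominated by the correlated size-related variables identified below would be expected here.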

Header of all quantitative variables

#Correlation matrix of all quantitative variables in dataframe

kable(head(dfnum))
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
1 60 65 8450 7 5 2003 2003 196 706 0 150 856 856 854 0 1710 1 0 2 1 3 1 8 0 2003 2 548 0 61 0 0 0 0 0 2 2008 208500
2 20 80 9600 6 8 1976 1976 0 978 0 284 1262 1262 0 0 1262 0 1 2 0 3 1 6 1 1976 2 460 298 0 0 0 0 0 0 5 2007 181500
3 60 68 11250 7 5 2001 2002 162 486 0 434 920 920 866 0 1786 1 0 2 1 3 1 6 1 2001 2 608 0 42 0 0 0 0 0 9 2008 223500
4 70 60 9550 7 5 1915 1970 0 216 0 540 756 961 756 0 1717 1 0 1 0 3 1 7 1 1998 3 642 0 35 272 0 0 0 0 2 2006 140000
5 60 84 14260 8 5 2000 2000 350 655 0 490 1145 1145 1053 0 2198 1 0 2 1 4 1 9 1 2000 3 836 192 84 0 0 0 0 0 12 2008 250000
6 50 85 14115 5 5 1993 1995 0 732 0 64 796 796 566 0 1362 1 0 1 1 1 1 5 0 1993 2 480 40 30 0 320 0 0 700 10 2009 143000

Header of correlation matrix for all quantitative variables

cormatrix<-cor(dfnum)
cordf<-as.data.frame(cormatrix)
kable(head(cordf))
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
Id 1.0000000 0.0111565 NA -0.0332255 -0.0283648 0.0126089 -0.0127127 -0.0219976 NA -0.0050240 -0.0059677 -0.0079397 -0.0154146 0.0104960 0.0055898 -0.0442300 0.0082728 0.0022886 -0.0201547 0.0055875 0.0067838 0.0377186 0.0029512 0.0272387 -0.0197716 NA 0.0165697 0.0176338 -0.0296432 -0.0004769 0.0028892 -0.0466348 0.0013302 0.0570439 -0.0062424 0.0211722 0.0007118 -0.0219167
MSSubClass 0.0111565 1.0000000 NA -0.1397811 0.0326277 -0.0593158 0.0278501 0.0405810 NA -0.0698357 -0.0656486 -0.1407595 -0.2385184 -0.2517584 0.3078857 0.0464738 0.0748532 0.0034910 -0.0023325 0.1316082 0.1773544 -0.0234380 0.2817210 0.0403801 -0.0455693 NA -0.0401098 -0.0986715 -0.0125794 -0.0061001 -0.0120366 -0.0438245 -0.0260302 0.0082827 -0.0076833 -0.0135846 -0.0214070 -0.0842841
LotFrontage NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
LotArea -0.0332255 -0.1397811 NA 1.0000000 0.1058057 -0.0056363 0.0142277 0.0137884 NA 0.2141031 0.1111697 -0.0026184 0.2608331 0.2994746 0.0509859 0.0047790 0.2631162 0.1581545 0.0480456 0.1260306 0.0142595 0.1196899 -0.0177839 0.1900148 0.2713640 NA 0.1548707 0.1804028 0.1716977 0.0847738 -0.0183397 0.0204228 0.0431604 0.0776724 0.0380677 0.0012050 -0.0142614 0.2638434
OverallQual -0.0283648 0.0326277 NA 0.1058057 1.0000000 -0.0919323 0.5723228 0.5506839 NA 0.2396660 -0.0591187 0.3081589 0.5378085 0.4762238 0.2954929 -0.0304293 0.5930074 0.1110978 -0.0401502 0.5505997 0.2734581 0.1016764 -0.1838822 0.4274523 0.3967650 NA 0.6006707 0.5620218 0.2389234 0.3088188 -0.1139369 0.0303706 0.0648864 0.0651658 -0.0314062 0.0708152 -0.0273467 0.7909816
OverallCond 0.0126089 -0.0593158 NA -0.0056363 -0.0919323 1.0000000 -0.3759832 0.0737415 NA -0.0462309 0.0402292 -0.1368406 -0.1710975 -0.1442028 0.0289421 0.0254943 -0.0796859 -0.0549415 0.1178209 -0.1941495 -0.0607693 0.0129801 -0.0870009 -0.0575832 -0.0238200 NA -0.1857575 -0.1515214 -0.0033337 -0.0325888 0.0703562 0.0255037 0.0548105 -0.0019849 0.0687768 -0.0035108 0.0439497 -0.0778559

Header of variables with correlation greater than 0.5

#Source from http://stackoverflow.com/questions/7074246/show-correlations-as-an-ordered-list-not-as-a-large-matrix

cordf[cordf == 1] <- NA #drop correlation of 1, diagonals
cordf[abs(cordf) < 0.5] <- NA # drop correlations of less than 0.5
cordf<-as.matrix(cordf)
cordf2<- na.omit(melt(cordf)) 
kable(head(cordf2[order(-abs(cordf2$value)),])) # sort by highest correlations
X1 X2 value
1016 GarageArea GarageCars 0.8824754
1053 GarageCars GarageArea 0.8824754
632 TotRmsAbvGrd GrLivArea 0.8254894
891 GrLivArea TotRmsAbvGrd 0.8254894
470 X1stFlrSF TotalBsmtSF 0.8195300
507 TotalBsmtSF X1stFlrSF 0.8195300
#corrplot(cordf, type = "upper", tl.col = "black", tl.srt = 45)

All variables with correlation to Sale Price greater than 0.5

cordf2<-as.data.frame(cordf2)
# head(cordf2) #view head
# str(cordf2) #view structure
topcors <- cordf2[ which(cordf2$X2=='SalePrice'),]

topcorsdf<-topcors[order(-abs(topcors$value)),]# sort by highest correlations
cors1<-data.frame(topcorsdf$X1,topcorsdf$X2,topcorsdf$value)
kable(cors1)
topcorsdf.X1 topcorsdf.X2 topcorsdf.value
OverallQual SalePrice 0.7909816
GrLivArea SalePrice 0.7086245
GarageCars SalePrice 0.6404092
GarageArea SalePrice 0.6234314
TotalBsmtSF SalePrice 0.6135806
X1stFlrSF SalePrice 0.6058522
FullBath SalePrice 0.5606638
TotRmsAbvGrd SalePrice 0.5337232
YearBuilt SalePrice 0.5228973
YearRemodAdd SalePrice 0.5071010

Plot of correlation to Sale Price

par(mar=c(8,8,1,1))
barplot(topcorsdf$value, ylab="Correlation to Sale Price", ylim=c(0,1), col=rainbow(20), las=2, names.arg=topcorsdf$X1)

Variables with strongest correlation to Sale Price in descending order:

  • OverallQual
  • GrLivArea
  • GarageCars
  • GarageArea
  • TotalBsmtSF
  • X1stFlrSF
  • FullBath
  • TotRmsAbvGrd
  • YearBuilt
  • YearRemodAdd
cormatdata <- select(df, OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, X1stFlrSF, FullBath, TotRmsAbvGrd)
cormat1 <- cor(cormatdata)
cormat1
##              OverallQual GrLivArea GarageCars GarageArea TotalBsmtSF
## OverallQual    1.0000000 0.5930074  0.6006707  0.5620218   0.5378085
## GrLivArea      0.5930074 1.0000000  0.4672474  0.4689975   0.4548682
## GarageCars     0.6006707 0.4672474  1.0000000  0.8824754   0.4345848
## GarageArea     0.5620218 0.4689975  0.8824754  1.0000000   0.4866655
## TotalBsmtSF    0.5378085 0.4548682  0.4345848  0.4866655   1.0000000
## X1stFlrSF      0.4762238 0.5660240  0.4393168  0.4897817   0.8195300
## FullBath       0.5505997 0.6300116  0.4696720  0.4056562   0.3237224
## TotRmsAbvGrd   0.4274523 0.8254894  0.3622886  0.3378221   0.2855726
##              X1stFlrSF  FullBath TotRmsAbvGrd
## OverallQual  0.4762238 0.5505997    0.4274523
## GrLivArea    0.5660240 0.6300116    0.8254894
## GarageCars   0.4393168 0.4696720    0.3622886
## GarageArea   0.4897817 0.4056562    0.3378221
## TotalBsmtSF  0.8195300 0.3237224    0.2855726
## X1stFlrSF    1.0000000 0.3806375    0.4095160
## FullBath     0.3806375 1.0000000    0.5547843
## TotRmsAbvGrd 0.4095160 0.5547843    1.0000000
corrplot(cormat1, method="circle")

Part 6: Sampling

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data.

For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

Minimum value is above zero

#check that the minimum value is already above zero, so no shift is needed
min(df$YearBuilt)
## [1] 1872

Run fitdistr to fit an exponential probability density function.

fit <- fitdistr(df$YearBuilt, "exponential")

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

#optimal value of λ for this distribution

lambda <- fit$estimate
sampledf <- rexp(1000, lambda)
lambda
##         rate 
## 0.0005072877
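
For an exponential distribution, the maximum-likelihood estimate of the rate is the reciprocal of the sample mean, which reproduces the fitted value:

1/mean(df$YearBuilt) #MLE of the exponential rate, approximately 0.0005072877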

Plot a histogram and compare it with a histogram of your original variable.

#Plot a histogram and compare it with a histogram of your original variable.

sampledf<-data.frame(as.numeric(sampledf))
colnames(sampledf)[1] <- "sample"
str(sampledf)
## 'data.frame':    1000 obs. of  1 variable:
##  $ sample: num  1253 2534 6874 6833 43724 ...
head(sampledf)
##       sample
## 1  1252.9546
## 2  2534.0248
## 3  6874.1797
## 4  6833.2280
## 5 43724.1191
## 6   275.5495
hist(sampledf$sample, col="green", main="Histogram of Exponential Samples", xlab = "Sampled value", breaks=30)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

#find the 5th and 95th percentiles
print("5th percentile")
## [1] "5th percentile"
qexp(.05,rate = lambda)
## [1] 101.1128
print("95th percentile")
## [1] "95th percentile"
qexp(.95, rate = lambda)
## [1] 5905.391
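
These quantiles follow from inverting the exponential CDF:

\[ F(t) = 1 - e^{-\lambda t} \quad \Rightarrow \quad t_p = -\ln(1-p)/\lambda \]

A quick check:

-log(1 - 0.05)/lambda #reproduces qexp(.05, rate = lambda), approximately 101.1
-log(1 - 0.95)/lambda #reproduces qexp(.95, rate = lambda), approximately 5905.4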

Also generate a 95% confidence interval from the empirical data, assuming normality.

#95% confidence interval from the empirical data
CI(df$YearBuilt, 0.95)
##    upper     mean    lower 
## 1972.818 1971.268 1969.717

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

quantile(df$YearBuilt, .05)
##   5% 
## 1916
quantile(df$YearBuilt, .95)
##  95% 
## 2007

The empirical percentiles (1916 and 2007) look nothing like the exponential model's (101 and 5905). YearBuilt is concentrated in a narrow band above its minimum of 1872, while an exponential distribution piles its mass near zero and has a long right tail, so the exponential fit is a poor description of this variable, as the comparison of the two histograms also shows.

Part 7: Modeling

Modeling. Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Test Model 1: AIC in a Stepwise Algorithm

#alternate model: backward stepwise selection by AIC over the columns with no missing values
modvars <- df[, which(sapply(df, function(x) sum(is.na(x))) == 0)]
modvars$Id <- NULL #drop the row index, which is not a predictor
model1 <- step(lm(SalePrice ~ ., data = modvars), direction = 'backward', trace = FALSE)
model1

#dfglm <- glm(df$SalePrice ~ ., family=binomial, data = df)
#dfstep <- stepAIC(dfglm, trace = FALSE)
#dfstep$anova

Test Model 2: Multiple Linear Regression

fit <- lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea, data=df)
summary(fit) # show results
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372594  -21236   -1594   18625  301129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -98436.050   4820.467 -20.420  < 2e-16 ***
## OverallQual  26988.854   1067.393  25.285  < 2e-16 ***
## GrLivArea       49.573      2.555  19.402  < 2e-16 ***
## GarageCars   11317.522   3126.297   3.620 0.000305 ***
## GarageArea      41.478     10.627   3.903 9.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared:  0.7418, Adjusted R-squared:  0.7411 
## F-statistic:  1045 on 4 and 1455 DF,  p-value: < 2.2e-16

Using the coefficients from the regression summary, write the fitted multiple linear regression equation.

\[ SalePrice = -98436.050 + 26988.854 \cdot OverallQual + 49.573 \cdot GrLivArea + 11317.522 \cdot GarageCars + 41.478 \cdot GarageArea \]

par(mfrow=c(2,2))
X1<-df$OverallQual
X2<-df$GrLivArea
X3<-df$GarageCars
X4<-df$GarageArea
Y<-df$SalePrice

plot(X1,Y, col="#f06292", main="OverallQual", ylab="Sale Price")
abline(lm(Y~X1), col="yellow", lwd=3) # regression line (y~x)

plot(X2,Y, col="#9c27b0", main="GrLivArea", ylab="Sale Price")
abline(lm(Y~X2), col="yellow", lwd=3) # regression line (y~x)

plot(X3,Y, col="#ce93d8", main="GarageCars", ylab="Sale Price")
abline(lm(Y~X3), col="yellow", lwd=3) # regression line (y~x)

plot(X4,Y, col="#c2185b", main="GarageArea", ylab="Sale Price")
abline(lm(Y~X4), col="yellow", lwd=3) # regression line (y~x)

Load the test data set and compute predicted sale prices from the fitted regression equation. Select the required columns and export to CSV for the contest entry.

dftest <- read.csv("test.csv")
#str(dftest)
#nrow(dftest)

SalePrice<-((26988.854*dftest$OverallQual) + (49.573*dftest$GrLivArea) +  (11317.522*dftest$GarageCars) + (41.478*dftest$GarageArea) -98436.050) #predict on the test set (dftest, not df)
#head(SalePrice)

dftest<-dftest[,c("Id","OverallQual","GrLivArea","GarageCars","GarageArea")]

kable(head(dftest))
Id OverallQual GrLivArea GarageCars GarageArea
1461 5 896 1 730
1462 6 1329 1 312
1463 5 1629 2 482
1464 6 1604 2 470
1465 8 1280 2 506
1466 6 1655 2 440
#tail(dftest)

submission <- data.frame(Id = dftest$Id, SalePrice = SalePrice)
#replace missing or negative predictions (NAs arise from missing garage values in the test set)
submission$SalePrice[is.na(submission$SalePrice) | submission$SalePrice < 0] <- median(SalePrice, na.rm = TRUE)
kable(head(submission))
Id SalePrice
1461 122522.1
1462 153638.2
1463 159890.1
1464 185141.9
1465 224551.1
1466 186425.8
#str(submission)
#dim(submission)
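
A less error-prone alternative is to let predict() score the test rows from the fitted model directly, rather than hand-copying coefficients; a sketch using the model fit above:

#score the test set with predict(), then apply the same NA/negative handling
pred <- predict(fit, newdata = dftest)
pred[is.na(pred) | pred < 0] <- median(pred, na.rm = TRUE)
submission2 <- data.frame(Id = dftest$Id, SalePrice = pred)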

Export CSV and submit to Kaggle.

Eval set to FALSE for reader convenience.

write.csv(submission, file = "submissionAAP.csv", quote=FALSE, row.names=FALSE)

Kaggle score: 0.60114