605 Final - House Prices: Advanced Regression Techniques

Data Source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

#Download required packages
suppressWarnings(suppressMessages(library(RCurl)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(XML)))
suppressWarnings(suppressMessages(library(Hmisc))) # correlation
suppressWarnings(suppressMessages(library(ggplot2)))

The Data

#Prepare raw data sets: read in csv or txt files into R 
housing_data <- read.csv(file="C:/Users/Ada/Desktop/CUNY_SPS_DA/605_02_Comp_Math_R/605 final- housing data fr kaggle/train.csv", header=TRUE, sep=",")

#summary(housing_data) #review raw data
dim(housing_data)

## [1] 1460   81

str(housing_data)

## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

Choose an independent variable X that is high corrected to SalePrice Y: GrLivArea

#pick all numerical columns in the table
num_col <- select_if(housing_data, is.numeric)

# let SalePrice as a dependent variable y
Y <- num_col$SalePrice

#choose a variable as an independent value, x, that has max correlation to salePrice 
head(sort(abs(cor(num_col))[,ncol(abs(cor(num_col)))],decreasing = TRUE))

##   SalePrice OverallQual   GrLivArea  GarageCars  GarageArea TotalBsmtSF 
##   1.0000000   0.7909816   0.7086245   0.6404092   0.6234314   0.6135806

#check skewness of X skewed at right
#Right-skewness (positive) indicate that mean greater than median
suppressWarnings(suppressMessages(library(e1071)))
skewness(housing_data$OverallQual)

## [1] 0.2164984

skewness(housing_data$GrLivArea)

## [1] 1.363754

#let x be housing_data$GrLivArea
#histogram :check distribution of x and skewed to the right
hist(housing_data$GrLivArea)

X <- housing_data$GrLivArea

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

Answer: First quartile Q1: is the median of the lower half of the data set. Another word is 25% of the numbers in the data set lie below Q1 and 75% lie above Q1.

#first quantile of X and Y
x <- unname(quantile(X, 0.25))
y <- unname(quantile(Y, 0.25))

#subset Y>y
Y_gt_y <- subset(housing_data [housing_data$SalePrice > y,], select=c("GrLivArea", "SalePrice"))
#subset X>x in Y>y
X_gt_x_subset <- subset(Y_gt_y [Y_gt_y$GrLivArea > x,], select=c("GrLivArea", "SalePrice"))


#a.P(X>x | Y>y)
nrow(X_gt_x_subset[1])/nrow(Y_gt_y[2])

## [1] 0.8712329

#b. P(X>x, Y>y)
nrow(housing_data[(housing_data$SalePrice >y) & (housing_data$GrLivArea >x), ])/nrow(housing_data)

## [1] 0.6534247

#c.P(X<x | Y>y)
1- nrow(X_gt_x_subset[1])/nrow(Y_gt_y[2])

## [1] 0.1287671

#find count in each condition in the table
#X<x & Y<y
lt_xy<- nrow(housing_data[(housing_data$SalePrice <y) & (housing_data$GrLivArea <x), ])

#X>x & Y<y
gt_x_lt_y <- nrow(housing_data[(housing_data$SalePrice <y) & (housing_data$GrLivArea >x), ])

#X<x & Y>y
lt_x_gt_y <-nrow(housing_data[(housing_data$SalePrice >y) & (housing_data$GrLivArea <x), ])

#X>x & Y>y
gt_xy <- nrow(housing_data[(housing_data$SalePrice >y) & (housing_data$GrLivArea >x), ])


#Creat and fill the table
corssTable <- matrix(c(lt_xy,lt_x_gt_y,lt_xy +lt_x_gt_y,gt_x_lt_y,gt_xy,gt_x_lt_y+gt_xy,lt_xy+gt_x_lt_y,lt_x_gt_y+gt_xy, lt_xy+gt_x_lt_y+lt_x_gt_y+gt_xy ),ncol=3,byrow=TRUE)
colnames(corssTable) <- c("Y < y","Y > y","Total")
rownames(corssTable) <- c("X < x","X > x","Total")
corssTable <- as.table(corssTable)
corssTable

##       Y < y Y > y Total
## X < x   224   141   365
## X > x   141   954  1095
## Total   365  1095  1460

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

Answer: A is 1095, B is 1095, P(AB) is 0.6534247 and P(A)*P(B)=0.75, P(AB) is not equal to P(A)P(B), which splitting the training data in this fashion make them not independent.\(\\\)

The result of the Chi-square test is that p value approachs to zero and reject H0, which means row and column variables are statistical significantly associated.

Chi-square test examines here whether rows and columns of the contingency table are statistically significantly associated.\(\\\) Null hypothesis (H0): the row and the column variables are independent.\(\\\) Alternative hypothesis (H1): row and column variables are dependent\(\\\) The expected value is calculated as follow: \(e=\frac { row.sum\quad *\quad col.sum }{ grand.total } \\\)

The Chi-square statistic is calculated as follow: \({ \chi }^{ 2 }=\sum { \frac { { (o\quad -e) }^{ 2 } }{ e } } ,\quad o\quad is\quad the\quad observed\quad value,\quad e\quad is\quad the\quad expected\quad value\)

A <- nrow(housing_data [housing_data$GrLivArea > x,])
A

## [1] 1095

B <- nrow(housing_data [housing_data$SalePrice >y,])
B

## [1] 1095

AB <- nrow(housing_data[(housing_data$SalePrice >y) & (housing_data$GrLivArea >x), ])
AB

## [1] 954

#P(AB)
AB/nrow(housing_data)

## [1] 0.6534247

#P(A)
A/nrow(housing_data)

## [1] 0.75

#P(B)
B/nrow(housing_data)

## [1] 0.75

#P(A)*P(B)
(A/nrow(housing_data)) *(B/nrow(housing_data))

## [1] 0.5625

#chi-square test of independence
#Result is reject H0 which row and colunm varialbes are independent. 
df1 <- subset(housing_data, select=c("GrLivArea", "SalePrice"))
chisq.test(df1)

## 
##  Pearson's Chi-squared test
## 
## data:  df1
## X-squared = 200960, df = 1459, p-value < 2.2e-16

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Answer: GrLivArea (X) and SalePrice (Y) is not linearity relating base on the scatterplot.

Correlation formula: v1 and v2 are two vectors of length n m1 and m2 corresponds to the means of v1 and v2 \(\\r\quad =\quad \frac { \sum { (v1-m1)(v2-m2) } }{ \sqrt { \sum { { (v1-m1) }^{ 2 } } \sum { { (v2-m2) }^{ 2 } } } } \\\)

Hypotheses test: t test with n-2 degrees of freedom, where n is the number of observation in v1 and v2 variables. \(\\t\quad =\quad \frac { r }{ \sqrt { 1-{ r }^{ 2 } } } \sqrt { n-2 } \\\)

#scatterplot of X and Y
plot(X, Y, main="Housing Data", 
    xlab="GrLivArea", ylab="Sale Price", pch=19)
abline(lm(Y~X), col="red") # regression line (Y~X) 
lines(lowess(X,Y), col="blue") # lowess line (X,Y)

#Derive a correlation matrix for any THREE quantitative variables in the dataset
df2<- subset(housing_data, select=c("GrLivArea", "GarageCars", "OverallQual")) # get data 
cor(df2)

##             GrLivArea GarageCars OverallQual
## GrLivArea   1.0000000  0.4672474   0.5930074
## GarageCars  0.4672474  1.0000000   0.6006707
## OverallQual 0.5930074  0.6006707   1.0000000

suppressWarnings(suppressMessages(library(gclus)))
df2.r <- abs(cor(df2)) # get correlations
df2.col <- dmat.color(df2.r) # get colors
# reorder variables so those with highest correlation
# are closest to the diagonal
df2.o <- order.single(df2.r) 
cpairs(df2, df2.o, panel.colors=df2.col, gap=.5,
main="GrLivArea, GarageCars and OverallQual Correlation" )

#Hypotheses test in 92% confidence interval 
#H0: correlations between GrLivArea and GarageCars set of variables is 0. 
#H1: correlations between GrLivArea and GarageCars set of variables isn't 0.
cor.test(~ GrLivArea+GarageCars, data = df2,conf.level=0.92 )

## 
##  Pearson's product-moment correlation
## 
## data:  GrLivArea and GarageCars
## t = 20.18, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.4306370 0.5023227
## sample estimates:
##       cor 
## 0.4672474

#Hypotheses test in 92% confidence interval
#H0: correlations between GarageCars and OverallQual set of variables is 0. 
#H1: correlations between GarageCars and OverallQual set of variables isn't 0.
cor.test(~ GarageCars+OverallQual, data = df2,conf.level=0.92 )

## 
##  Pearson's product-moment correlation
## 
## data:  GarageCars and OverallQual
## t = 28.688, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5705454 0.6291817
## sample estimates:
##       cor 
## 0.6006707

#Hypotheses test in 92% confidence interval
#H0: correlations between GrLivArea and OverallQual set of variables is 0. 
#H1: correlations between GrLivArea and OverallQual set of variables isn't 0.
cor.test(~ GrLivArea+OverallQual, data = df2,conf.level=0.92 )

## 
##  Pearson's product-moment correlation
## 
## data:  GrLivArea and OverallQual
## t = 28.121, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5624621 0.6219364
## sample estimates:
##       cor 
## 0.5930074

Linear Algebra and Correlation

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

#precision matrix: Invert corr matrix
round(solve(cor(df2)),4)

##             GrLivArea GarageCars OverallQual
## GrLivArea      1.5897    -0.2762     -0.7768
## GarageCars    -0.2762     1.6124     -0.8048
## OverallQual   -0.7768    -0.8048      1.9441

suppressWarnings(suppressMessages(library(Matrix)))
round(cor(df2) %*% solve(cor(df2)),4)

##             GrLivArea GarageCars OverallQual
## GrLivArea           1          0           0
## GarageCars          0          1           0
## OverallQual         0          0           1

round(solve(cor(df2)) %*% cor(df2),4)

##             GrLivArea GarageCars OverallQual
## GrLivArea           1          0           0
## GarageCars          0          1           0
## OverallQual         0          0           1

I write a R factorize function for nxn matrix A into LU.

#check input error
checkinput <- function(v,num.dimantion){
  total.element =num.dimantion^2
  sum=0
  for(i in 1: num.dimantion){
      sum=sum+abs(v[1+(i-1)*num.dimantion])
  }
  
  if(length(v) != total.element ){
    print ("It's not a square matrix. Place check your input numbers.")
    return (-1)
  }else if(sum == 0){
      print("The first column is zero vector.")
      return (-1)
  }else {
    #print ("square")
    return (0)
  }
}

#check pivor numvber of each row and swap next nonzero row to current row  while pivor number is zero
checkPivorSwap <- function (M,jcol, num.dimantion) {
    if (M[jcol,jcol] == 0) {
      temp=M[jcol, ]
    
      for(irow in (jcol+1):num.dimantion){
        if(M[irow,jcol] != 0){
          M[jcol, ]= M[irow, ]
          M[irow, ]= temp
          return(M)
        }
      }
    }
  return(M)
}

LUfunction<- function (a,num.dimantion){
  #check input error
  inputError = checkinput(a,num.dimantion)
  if ( inputError == 0 ) {
    
    #build original square matrix
    A = matrix(a,nrow=num.dimantion,ncol=num.dimantion, byrow=TRUE)
    
    #let lowerMatrix and uperMatrix be an n square matrix to store all factors:L and U
    lowerMatrix = diag(num.dimantion)
    uperMatrix = A
    
    jcol=1
    while(jcol < num.dimantion) {
      uperMatrix = checkPivorSwap(uperMatrix,jcol,num.dimantion)
      for(irow in (jcol+1):num.dimantion){

        #built L matrix
        if(uperMatrix[jcol, jcol] != 0) {
          factorL= ((uperMatrix[irow, jcol]/uperMatrix[jcol, jcol]))
        }else{
          factorL=0
        }
        lowerMatrix[irow, jcol] = factorL
        
        #builr U matrix 
        uperMatrix[irow,] = ((-1)*(factorL * uperMatrix[jcol, ]) + uperMatrix[irow, ])
      }
      jcol = jcol +1
    }
    print("L:lowerMatrix")
    print(lowerMatrix)
    print("U:uperMatrix")
    print(uperMatrix)
    print("LU")
    print(lowerMatrix%*%uperMatrix)
    print("A")
    print(A)
  }
}

corrMatrix <-c(1.5897,-0.2762,-0.7768,-0.2762,1.6124,-0.8048,-0.7768,-0.8048,1.9441)
LUfunction(corrMatrix,3)

## [1] "L:lowerMatrix"
##            [,1]       [,2] [,3]
## [1,]  1.0000000  0.0000000    0
## [2,] -0.1737435  1.0000000    0
## [3,] -0.4886457 -0.6007138    1
## [1] "U:uperMatrix"
##        [,1]          [,2]       [,3]
## [1,] 1.5897 -2.762000e-01 -0.7768000
## [2,] 0.0000  1.564412e+00 -0.9397639
## [3,] 0.0000  1.110223e-16  0.9999909
## [1] "LU"
##         [,1]    [,2]    [,3]
## [1,]  1.5897 -0.2762 -0.7768
## [2,] -0.2762  1.6124 -0.8048
## [3,] -0.7768 -0.8048  1.9441
## [1] "A"
##         [,1]    [,2]    [,3]
## [1,]  1.5897 -0.2762 -0.7768
## [2,] -0.2762  1.6124 -0.8048
## [3,] -0.7768 -0.8048  1.9441

Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of ?? for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, ??)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Answer:

\(\lambda\) in an exponentially distributed random variable X is 1 over mean of X.Variance of X is 1 over \(\lambda^2\). So \(\lambda\) is biger and variance is smaller. By following this properties of exponential distribution, I will simmulat \(\lambda\) from random 1000 sample sales of X by using rexp(1000, ??) in R.

#shift right skewness data 
suppressWarnings(suppressMessages(library(e1071)))
hist(X, col = "tomato")

# normal fit 
qqnorm(X); qqline(X)

#fitting an exponential probability density function
suppressWarnings(suppressMessages(library(MASS)))
rate_X<- fitdistr(X, "exponential")

#exponential distribution
mu <- mean(X) #mean of G
lambda <-1/mu #rate

#generate 1000 sample size random exponentials and distribution
set.seed(2802)
hist(rexp(1000, lambda), main="Distribution of 1000 random exponential distribution", col="lightblue")

Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

step1: choose top five highese correlated variables for sales price Y prediction Step2: fitting the multipule regression model Step3: evaluation of the model - residual variance and R-Square Step4: testing the model by test set

mulRegess <- lm(SalePrice~OverallQual+GrLivArea+GarageCars+GarageArea +TotalBsmtSF, data = housing_data)

summary(mulRegess)

## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF, data = housing_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -478977  -19915   -1503   16701  287132 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99072.050   4638.450 -21.359  < 2e-16 ***
## OverallQual  23635.007   1072.532  22.037  < 2e-16 ***
## GrLivArea       45.346      2.489  18.218  < 2e-16 ***
## GarageCars   14544.315   3022.681   4.812 1.65e-06 ***
## GarageArea      17.133     10.468   1.637    0.102    
## TotalBsmtSF     31.501      2.904  10.848  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38900 on 1454 degrees of freedom
## Multiple R-squared:  0.7611, Adjusted R-squared:  0.7603 
## F-statistic: 926.5 on 5 and 1454 DF,  p-value: < 2.2e-16

Since p-value of GarageArea is greater than 0.05, I drop this variable from the model.

mulRegess <- lm(SalePrice~OverallQual+GrLivArea+GarageCars+TotalBsmtSF, data = housing_data)

summary(mulRegess)

## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     TotalBsmtSF, data = housing_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -469856  -19956   -1360   17200  286304 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99248.853   4639.866  -21.39   <2e-16 ***
## OverallQual  23572.236   1072.465   21.98   <2e-16 ***
## GrLivArea       45.643      2.484   18.38   <2e-16 ***
## GarageCars   18582.209   1747.412   10.63   <2e-16 ***
## TotalBsmtSF     32.520      2.838   11.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38920 on 1455 degrees of freedom
## Multiple R-squared:  0.7607, Adjusted R-squared:   0.76 
## F-statistic:  1156 on 4 and 1455 DF,  p-value: < 2.2e-16

est_price <- function (oq,ga,gc,bs) {
   if (is.na(oq)) { oq = 0 }
   if (is.na(ga)) { ga = 0 }
   if (is.na(gc)) { gc = 0 }
   if (is.na(bs)) { bs = 0 }
   return ((-99248.853 + 23572.236*oq + 45.643*ga + 18582.209*gc + 32.520*bs))
}

test <- read.csv(file="C:/Users/Ada/Desktop/CUNY_SPS_DA/605_02_Comp_Math_R/605 final- housing data fr kaggle/test.csv", header=TRUE, sep=",")

test$SalePrice <- test$Id

for(row in 1:nrow(test)) {
  test[row,]$SalePrice <-  est_price( test[row,]$OverallQual, test[row,]$GrLivArea, test[row,]$GarageCars, test[row,]$TotalBsmtSF)
}

predict_price <- subset(test, select=c("Id", "SalePrice"))

write.csv(predict_price, "C:/Users/Ada/Desktop/CUNY_SPS_DA/605_02_Comp_Math_R/605 final- housing data fr kaggle/predict_saleprice.csv")

The predict price file has been submitted here, 0.62973. https://www.kaggle.com/c/house-prices-advanced-regression-techniques/leaderboard