Data605 Final project

Problem 1.

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of μ=σ=(N+1)/2.

set.seed(123)
N <- 6
n <- 10000
mu <- sigma <- (N + 1)/2
df <- data.frame(X = runif(n, min = 1, max = N), Y = rnorm(n, mean = mu, sd = sigma))
summary(df$X)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.264   3.473   3.488   4.717   6.000

summary(df$Y)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -9.959   1.171   3.484   3.507   5.937  16.967

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

x <- median(df$X)
y <- quantile(df$Y, 0.25)

P(X>x | X>y)

total <- nrow(df)
a <- nrow(subset(df, X>y))/total
a1 <- nrow(subset(df, X>x & Y>y))/total
a1/a

## [1] 0.3895861

P(X>x, Y>y)

b <- nrow(subset(df, X>x& Y>y))/total
b

## [1] 0.3756

P(X<x | X>y)

c <- nrow(subset(df, X<x& X>y))/total
c

## [1] 0.4641

P(X>x and Y>y)=P(X>x)P(Y>y)

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

pxgx <- nrow(subset(df, X>x))/total
pygy <- nrow(subset(df, Y>y))/total
pxgxygy<- nrow(subset(df, X>x & Y>y))/total

pxpy <- pxgx *pygy

round(pxgxygy, 2)

## [1] 0.38

round(pxpy, 2)

## [1] 0.38

Answer: P(X>x and Y>y)=P(X>x)P(Y>y) is true

Table

table <- cbind(pxgx, pygy, pxpy, pxpy)
colnames(table) <- c( "P(X>x)", "P(Y>y)", "P(X>x)P(Y>y)", "P(X>x & Y>y)")
table

##      P(X>x) P(Y>y) P(X>x)P(Y>y) P(X>x & Y>y)
## [1,]    0.5   0.75        0.375        0.375

a1 <- subset(df, X > x)
b1 <- subset(df, Y > y)
le_x <- subset(df, X <= x)
le_y <- subset(df, Y <= y)
table <- matrix(c(nrow(a1), nrow(b1), nrow(le_x), nrow(le_y)), 2, 2,
                dimnames = list(c("x", "y"),
                                c("X > x, Y > y", "X <= x, Y <= y")))
table

##   X > x, Y > y X <= x, Y <= y
## x         5000           5000
## y         7500           2500

Fisher’s exact/Chi square Test

H0 : X and Y are independent HA : Y and Y are not independent Fisher’s exact test

fisher.test(table)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  table
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.3137986 0.3540659
## sample estimates:
## odds ratio 
##  0.3333333

Chi squre test

chisq.test(table)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table
## X-squared = 1332.3, df = 1, p-value < 2.2e-16

P-value is less than 0.05. The null hypothesis is rejected. Therefore, X and Y are not indepedent.

Problem 2.

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

Descriptive and Inferential Statistics

train <- read.csv("https://raw.githubusercontent.com/ekhahm/datascience/master/Data605/train.csv")
head(train)

##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000

1. descriptive statistics

Provide univariate descriptive statistics and appropriate plots for the training data set.

Variable 1:Overall Quality

summary(train$OverallQual)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.000   6.000   6.099   7.000  10.000

par(mfrow=c(1,2))
boxplot(train$OverallQual, main="Boxplot of Overall Quality")
hist(train$OverallQual, main= "Histogram of Overall Quality", col= "blue")

Independent variable 2 : Year built

summary(train$YearBuilt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1872    1954    1973    1971    2000    2010

par(mfrow=c(1,2))
boxplot(train$YearBuilt, main="Boxplot of Year built")
hist(train$YearBuilt, main= "Histogram of Year Built", col= "red")

Dependent variable : Sale price

summary(train$SalePrice)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

par(mfrow=c(1,2))
boxplot(train$SalePrice, main="Boxplot of sale price")
hist(train$SalePrice, main= "Histogram of sale price", col= "red")

####2. Scatterplot

plot(train$OverallQual, train$SalePrice, xlab= "Overall quality", ylab="Sale price", main= "scatterplot of Overall Quality vs. Sale Price")

plot(train$YearBuilt, train$SalePrice, xlab= "Year Built", ylab="Sale price", main= "scatterplot of Year Built vs. Sale Price")

3. Corrleation matrix

corDF <- train[c("SalePrice", "OverallQual", "YearBuilt")]
corMatrix <- cor(corDF, use = "complete.obs")
print(corMatrix)

##             SalePrice OverallQual YearBuilt
## SalePrice   1.0000000   0.7909816 0.5228973
## OverallQual 0.7909816   1.0000000 0.5723228
## YearBuilt   0.5228973   0.5723228 1.0000000

4. Test hypothesis

cor.test(train$OverallQual, train$SalePrice, conf.level = 0.8)

## 
##  Pearson's product-moment correlation
## 
## data:  train$OverallQual and train$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7780752 0.8032204
## sample estimates:
##       cor 
## 0.7909816

The correlation value is 0.79. It indicates there is a correlation between overall quality and sale price. P-value is less than 0.05. It can be concluded that two varialbes are correlated.

cor.test(train$YearBuilt, train$SalePrice, conf.level = 0.8)

## 
##  Pearson's product-moment correlation
## 
## data:  train$YearBuilt and train$SalePrice
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4980766 0.5468619
## sample estimates:
##       cor 
## 0.5228973

The correlation value is 0.522. It indeicates there is moderate correlation between Year built and sale price.

Linear algebra and Correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Precision matrix

p_matrix <- solve(corMatrix)
p_matrix

##              SalePrice OverallQual  YearBuilt
## SalePrice    2.7246511  -1.9923563 -0.2844419
## OverallQual -1.9923563   2.9439846 -0.6431116
## YearBuilt   -0.2844419  -0.6431116  1.5168013

Multiply the correlation matrix by the precision matrix

multiply <- round(corMatrix %*% p_matrix)
multiply

##             SalePrice OverallQual YearBuilt
## SalePrice           1           0         0
## OverallQual         0           1         0
## YearBuilt           0           0         1

Multiply the precision matrix by the correlation matrix

multiply1 <- round(p_matrix %*% corMatrix)
multiply1

##             SalePrice OverallQual YearBuilt
## SalePrice           1           0         0
## OverallQual         0           1         0
## YearBuilt           0           0         1

LU decomposition on the matrix.

decomposition <- lu.decomposition(corMatrix)
decomposition

## $L
##           [,1]     [,2] [,3]
## [1,] 1.0000000 0.000000    0
## [2,] 0.7909816 1.000000    0
## [3,] 0.5228973 0.423992    1
## 
## $U
##      [,1]      [,2]      [,3]
## [1,]    1 0.7909816 0.5228973
## [2,]    0 0.3743481 0.1587206
## [3,]    0 0.0000000 0.6592821

corMatrix

##             SalePrice OverallQual YearBuilt
## SalePrice   1.0000000   0.7909816 0.5228973
## OverallQual 0.7909816   1.0000000 0.5723228
## YearBuilt   0.5228973   0.5723228 1.0000000

Calculus Based Probaility & Statistics

X <- train$OverallQual[!is.na(train$OverallQual)]
fd <- fitdistr(X, "gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

hist(X, breaks=40, prob=TRUE, xlab="OverallQual",
     main="Overall Quality")
curve(dgamma(x, shape = fd$estimate['shape'], rate = fd$estimate['rate']), 
      col="blue", add=TRUE)

Percentile exponential pdf - 5th and 95th percentiles

epb<- fitdistr(train$OverallQual, "exponential")
lambda <- epb$estimate
lambda

##      rate 
## 0.1639528

qexp(0.05,rate= lambda)

## [1] 0.312854

qexp(0.95, rate= lambda)

## [1] 18.27191

Confidential interval

CI(na.exclude(train$OverallQual), ci= 0.95)

##    upper     mean    lower 
## 6.170314 6.099315 6.028316

Modeling

model = train[, which(sapply(train, function(x) sum(is.na(x))) == 0)]
ft = lm(train$SalePrice ~ train$OverallQual + train$YearBuilt + train$LotFrontage + train$GarageArea, data = train)
summary(ft)

## 
## Call:
## lm(formula = train$SalePrice ~ train$OverallQual + train$YearBuilt + 
##     train$LotFrontage + train$GarageArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -338703  -26925   -4312   17264  388173 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -416601.20  101491.58  -4.105 4.32e-05 ***
## train$OverallQual   36512.68    1275.21  28.633  < 2e-16 ***
## train$YearBuilt       157.19      53.37   2.945  0.00329 ** 
## train$LotFrontage     411.21      58.54   7.025 3.59e-12 ***
## train$GarageArea       74.63       7.88   9.471  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45900 on 1196 degrees of freedom
##   (259 observations deleted due to missingness)
## Multiple R-squared:  0.698,  Adjusted R-squared:  0.697 
## F-statistic:   691 on 4 and 1196 DF,  p-value: < 2.2e-16