library(tidyr)
library(matrixcalc)
library(corrplot)
## corrplot 0.84 loaded
library(MASS)
N <- 10
n <- 10000
X <- runif(n = n, min = 1, max = N)
Y <- rnorm(n, mean=(N+1)/2, sd = (N+1)/2)
x <- median(X)
y <- quantile(Y)[2]
df <- data.frame(X, Y)
a. P(X>x | X>y)
Identify first the Probability of X>y
dfXy <- subset(df, df$X>y)
Identify the Probability of X>x in the above subset
n2 <- nrow(subset(dfXy, df$X>x))
n2/n
## [1] 0.5
b. P(X>x, Y>y)
n2 <- nrow(subset(df, df$X>x & df$Y>y))
Pxny <- n2/n
Pxny
## [1] 0.3776
c. P(X<x | X>y)
n2 <- nrow(subset(dfXy, df$X<x))
n2/n
## [1] 0.5
Px <- nrow(subset(df, df$X>x))/n
Py <- nrow(subset(df, df$Y>y))/n
Px*Py
## [1] 0.375
dfPtableValues <- matrix(c( nrow( df %>% subset(df$X < x & df$Y < y) ),
nrow( df %>% subset(df$X > x & df$Y < y)),
nrow( df %>% subset(df$X < x & df$Y > y)),
nrow( df %>% subset(df$X > x & df$Y > y))
))
table <- matrix(dfPtableValues , nrow=2)
colnames(table) <- c('Y < y','Y > y')
rownames(table) <- c('X < x','X > x')
table
## Y < y Y > y
## X < x 1276 3724
## X > x 1224 3776
dfPtable <- c( nrow( df %>% subset(df$X < x & df$Y < y) )/n,
nrow( df %>% subset(df$X > x & df$Y < y))/n,
nrow( df %>% subset(df$X < x & df$Y > y))/n,
nrow( df %>% subset(df$X > x & df$Y > y))/n
)
table <- matrix(dfPtable , nrow=2)
colnames(table) <- c('Y < y','Y > y')
rownames(table) <- c('X < x','X > x')
table
## Y < y Y > y
## X < x 0.1276 0.3724
## X > x 0.1224 0.3776
Looking at the individual values, marginal and joint probabilities are almost similar i.e., 0.375 vs 0.3776
Fisher’s test is used
pTable <- matrix(dfPtableValues , nrow=2)
fisher.test(pTable)
##
## Fisher's Exact Test for Count Data
##
## data: pTable
## p-value = 0.2389
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.964496 1.158476
## sample estimates:
## odds ratio
## 1.057068
Chi Square test is used
chisq.test(pTable)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: pTable
## X-squared = 1.3872, df = 1, p-value = 0.2389
Since both tests return higher p value, they fail to reject the null hypothesis.
#Load the training dataset
trainData <- read.csv("kaggle/train.csv")
#Show summary of the dataset
summary(trainData)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour Utilities
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
## CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
## FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
## Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
## Gilbert: 79 RRAe : 11 PosA : 1
## (Other):707 (Other): 15 (Other): 2
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872
## 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973
## Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971
## TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## Min. :1950 Flat : 13 CompShg:1434 VinylSd:515 VinylSd:504
## 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222 MetalSd:214
## Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220 HdBoard:207
## Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206 Wd Sdng:197
## 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108 Plywood:142
## Max. :2010 Shed : 2 Membran: 1 CemntBd: 61 CmentBd: 60
## (Other): 2 (Other):128 (Other):136
## MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146 Ex :121
## BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634 Fa : 35
## None :864 Median : 0.0 Gd:488 Gd: 146 PConc :647 Gd :618
## Stone :128 Mean : 103.7 TA:906 Po: 1 Slab : 24 TA :649
## NA's : 8 3rd Qu.: 166.0 TA:1282 Stone : 6 NA's: 37
## Max. :1600.0 Wood : 3
## NA's :8
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Fa : 45 Av :221 ALQ :220 Min. : 0.0 ALQ : 19
## Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
## Po : 2 Mn :114 GLQ :418 Median : 383.5 GLQ : 14
## TA :1311 No :953 LwQ : 74 Mean : 443.6 LwQ : 46
## NA's: 37 NA's: 38 Rec :133 3rd Qu.: 712.2 Rec : 54
## Unf :430 Max. :5644.0 Unf :1256
## NA's: 37 NA's: 38
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49
## Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Gd:586 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Maj1: 14 Min. :0.000 Ex : 24 2Types : 6 Min. :1900
## Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961
## Min1: 31 Median :1.000 Gd :380 Basment: 19 Median :1980
## Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979
## Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Sev : 1 Max. :3.000 NA's:690 Detchd :387 Max. :2010
## Typ :1360 NA's : 81 NA's :81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin :352 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2
## RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35
## Unf :605 Median :2.000 Median : 480.0 Gd : 14 Gd : 9
## NA's: 81 Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129975
## Alloca : 12 Median :163000
## Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
univariate statistics and plots
par(mfrow=c(2,3))
plot(trainData$MSZoning, main = 'Zoning')
hist(trainData$LotArea, main = 'Lot Area')
hist(trainData$TotalBsmtSF, main = 'Basement sqft')
plot(trainData$Neighborhood, main = 'Neighborhood')
hist(trainData$SalePrice, main = 'Sale price')
hist(trainData$BedroomAbvGr, main = 'Bedroom count')
#Select 2 independent variables i.e., 'GrLivArea' and 'SaleCondition', and a response variable 'SalePrice'
plot(trainData$GrLivArea, trainData$SalePrice, xlab = "Gr LivingArea", ylab = "Sale price", main='LivingArea vs Sale price')
plot(trainData$SaleCondition, trainData$SalePrice, xlab = "Sale condition", ylab = "Sale price", main='Sale condition vs Sale price')
subsetData <- trainData[c('BedroomAbvGr', 'GrLivArea', 'SalePrice')]
#Calculate the co-relations between the 3 variables
corMatrix <- cor(subsetData)
corMatrix
## BedroomAbvGr GrLivArea SalePrice
## BedroomAbvGr 1.0000000 0.5212695 0.1682132
## GrLivArea 0.5212695 1.0000000 0.7086245
## SalePrice 0.1682132 0.7086245 1.0000000
# Plot the co-relations
corrplot(corMatrix)
#Test the co-relations for a confidence level of 0.8
cor.test(subsetData$BedroomAbvGr, subsetData$SalePrice, conf.level = .8)
##
## Pearson's product-moment correlation
##
## data: subsetData$BedroomAbvGr and subsetData$SalePrice
## t = 6.5159, df = 1458, p-value = 9.927e-11
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1354160 0.2006421
## sample estimates:
## cor
## 0.1682132
cor.test(subsetData$GrLivArea, subsetData$SalePrice, conf.level = .8)
##
## Pearson's product-moment correlation
##
## data: subsetData$GrLivArea and subsetData$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
cor.test(subsetData$BedroomAbvGr, subsetData$GrLivArea, conf.level = .8)
##
## Pearson's product-moment correlation
##
## data: subsetData$BedroomAbvGr and subsetData$GrLivArea
## t = 23.323, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4963921 0.5452915
## sample estimates:
## cor
## 0.5212695
Would you be worried about familywise error? Due to the high number of variables available, and many of which are categorical nature and as I’m focussed on quantitative variables for this analysis, it is possible that I might be making false assumptions leading to FWE.
inverseCorMatrix <- solve(corMatrix)
x <- round(corMatrix %*% inverseCorMatrix, 1)
x
## BedroomAbvGr GrLivArea SalePrice
## BedroomAbvGr 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
y <- round(inverseCorMatrix %*% corMatrix, 1)
y
## BedroomAbvGr GrLivArea SalePrice
## BedroomAbvGr 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
lu.decomposition(x)
## $L
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
lu.decomposition(y)
## $L
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
The decomposition of both x and y matrix resolve to same value.
a <- fitdistr(x=trainData$SalePrice, densfun = "exponential")
value <- a$estimate
data <- rexp(1000, value)
quantile(data, c(.05,.95))
## 5% 95%
## 10542.15 547971.98
#Plot graphs
par(mfrow=c(1,2))
hist(trainData$SalePrice,main = "Actual train data")
#hist(trainData$SalePrice_log,main = "Skew adjusted train data")
hist(data,main = "distrbution data")
5th and 95th percentiles using the cumulative distribution function (CDF)
quantile(data, c(.05,.95))
## 5% 95%
## 10542.15 547971.98
Empirical 5th percentile and 95th percentile of the data
quantile(trainData$SalePrice, c(.05,.95))
## 5% 95%
## 88000 326100
data <- rnorm(length(trainData$SalePrice), mean = mean(trainData$SalePrice), sd=sd(trainData$SalePrice))
hist(data, main = "Normalized data")
I observed there are high statistical outliers in the data in certain cases and calculated the quantile data for each variable I’m considering for this regression model. Based on my observation, I have mostly selected 98 percentile data and removed first 5 percentile data for a particular variable (yearbuild)
#Calculate and remove observations with outliers for variables 'GrLivArea','TotalBsmtSF','TotRmsAbvGrd','GarageArea','YearBuilt'
# Showcasing quantile data for Living area
quantile(trainData$GrLivArea, c(1:100)/100)
## 1% 2% 3% 4% 5% 6% 7% 8% 9% 10%
## 692.18 768.00 796.00 828.80 848.00 864.00 864.00 885.44 895.55 912.00
## 11% 12% 13% 14% 15% 16% 17% 18% 19% 20%
## 924.49 948.00 960.00 980.00 988.00 1004.00 1034.00 1040.00 1054.00 1066.60
## 21% 22% 23% 24% 25% 26% 27% 28% 29% 30%
## 1078.78 1092.00 1109.57 1120.00 1129.50 1144.00 1154.93 1178.00 1196.22 1208.00
## 31% 32% 33% 34% 35% 36% 37% 38% 39% 40%
## 1217.29 1224.00 1236.00 1251.06 1262.00 1279.96 1300.49 1311.68 1324.00 1339.00
## 41% 42% 43% 44% 45% 46% 47% 48% 49% 50%
## 1348.00 1360.00 1368.00 1382.00 1392.55 1414.00 1427.46 1437.96 1452.91 1464.00
## 51% 52% 53% 54% 55% 56% 57% 58% 59% 60%
## 1474.09 1483.36 1494.00 1501.86 1509.45 1525.00 1540.26 1557.22 1571.00 1578.00
## 61% 62% 63% 64% 65% 66% 67% 68% 69% 70%
## 1599.95 1612.74 1626.51 1639.76 1651.35 1660.00 1668.00 1683.12 1694.00 1709.30
## 71% 72% 73% 74% 75% 76% 77% 78% 79% 80%
## 1717.00 1728.00 1739.07 1765.32 1776.75 1792.00 1803.43 1836.00 1849.22 1869.00
## 81% 82% 83% 84% 85% 86% 87% 88% 89% 90%
## 1907.37 1928.00 1949.97 1966.24 1987.30 2020.74 2058.66 2090.00 2120.02 2158.30
## 91% 92% 93% 94% 95% 96% 97% 98% 99% 100%
## 2221.14 2264.12 2328.35 2385.52 2466.10 2545.72 2633.23 2782.38 3123.48 5642.00
trainData_nonOutlier <- trainData[c('SalePrice','GrLivArea','TotalBsmtSF','TotRmsAbvGrd','GarageArea','YearBuilt')]
trainData_nonOutlier <- subset(trainData_nonOutlier, trainData_nonOutlier$GrLivArea<quantile(trainData_nonOutlier$GrLivArea, c(1:100)/100)[98][1])
trainData_nonOutlier <- subset(trainData_nonOutlier, trainData_nonOutlier$TotalBsmtSF<quantile(trainData_nonOutlier$TotalBsmtSF, c(1:100)/100)[98][1])
trainData_nonOutlier <- subset(trainData_nonOutlier, trainData_nonOutlier$TotRmsAbvGrd<quantile(trainData_nonOutlier$TotRmsAbvGrd, c(1:100)/100)[98][1])
trainData_nonOutlier <- subset(trainData_nonOutlier, trainData_nonOutlier$GarageArea<quantile(trainData_nonOutlier$GarageArea, c(1:100)/100)[98][1])
trainData_nonOutlier <- subset(trainData_nonOutlier, trainData_nonOutlier$YearBuilt>quantile(trainData_nonOutlier$YearBuilt, c(1:100)/100)[5][1])
m2 <- lm(SalePrice ~ ., data=trainData_nonOutlier)
summary(m2)
##
## Call:
## lm(formula = SalePrice ~ ., data = trainData_nonOutlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -118635 -15917 -1968 14031 179803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.162e+06 7.416e+04 -15.674 < 2e-16 ***
## GrLivArea 7.944e+01 3.635e+00 21.851 < 2e-16 ***
## TotalBsmtSF 4.537e+01 2.612e+00 17.370 < 2e-16 ***
## TotRmsAbvGrd -3.449e+03 1.052e+03 -3.280 0.00107 **
## GarageArea 5.171e+01 5.846e+00 8.846 < 2e-16 ***
## YearBuilt 5.940e+02 3.835e+01 15.491 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29430 on 1248 degrees of freedom
## Multiple R-squared: 0.7678, Adjusted R-squared: 0.7669
## F-statistic: 825.6 on 5 and 1248 DF, p-value: < 2.2e-16
plot(m2$residuals, main='Residuals')
res <- resid(m2)
qqnorm(res)
qqline(res)
testData <- read.csv("kaggle/test.csv")
pred <- predict(m2, testData)
kaggleResult <- data.frame( Id = testData[,"Id"], SalePrice =pred)
kaggleResult[kaggleResult<0] <- 0
kaggleResult <- replace(kaggleResult,is.na(kaggleResult),0)
write.csv(kaggleResult, file="kaggleResult.csv", row.names = FALSE)
My username is Cheruku and I scored .49038 in first attempt. Will making few more tries to improve my score.