# Simulate the two random variables used in the probability exercise:
#   X ~ Uniform(1, N) and Y ~ Normal(mu, sigma), with mu = sigma = (N + 1) / 2.
# The seed is fixed so the summaries below are reproducible.
set.seed(4763)
N <- 10
n <- 10000
mu <- sigma <- (N + 1) / 2
df <- data.frame(
  X = runif(n, min = 1, max = N),
  Y = rnorm(n, mean = mu, sd = sigma)
)

# Five-number summaries (plus mean) of each simulated variable; the printed
# output is kept as `##` comments so the code stays parseable.
summary(df$X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   1.001   3.285   5.566   5.551   7.825   9.999
summary(df$Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
## -14.489   1.879   5.469   5.542   9.288  28.206
# Label every simulated row by whether X exceeds x and whether Y exceeds y,
# then count the rows in each (A, B) cell and convert counts to joint
# probabilities by dividing by the number of simulated rows, n.
labelled_rows <- dplyr::mutate(
  df,
  A = ifelse(X > x, " X greater than x", " X not greater than x"),
  B = ifelse(Y > y, " Y greater than y", " Y not greater than y")
)
cell_counts <- dplyr::summarise(
  dplyr::group_by(labelled_rows, A, B),
  count = dplyr::n()
)
df <- dplyr::mutate(cell_counts, probability = count / n)
# Marginal Probabilities
# Sum counts and probabilities over B within each level of A, tag those rows
# with B = "Total", and stack them on top of the existing cell rows.
totals_by_a <- dplyr::summarise(
  dplyr::group_by(dplyr::ungroup(df), A),
  count = sum(count),
  probability = sum(probability)
)
totals_by_a <- mutate(totals_by_a, B = "Total")
df <- bind_rows(totals_by_a, df)
# Sum counts and probabilities over A within each level of B, tag those rows
# with A = "Total", and stack them on top of the rows already in df.
totals_by_b <- dplyr::summarise(
  dplyr::group_by(dplyr::ungroup(df), B),
  count = sum(count),
  probability = sum(probability)
)
totals_by_b <- dplyr::mutate(totals_by_b, A = "Total")
df <- bind_rows(totals_by_b, df)
# Render the probabilities as a table: one row per level of B, one column per
# level of A.
# spread() is exported by tidyr, not dplyr; the previous `dplyr::spread` call
# failed with "'spread' is not an exported object from 'namespace:dplyr'".
df %>%
  dplyr::select(-count) %>%
  tidyr::spread(A, probability) %>%
  dplyr::rename(" " = B) %>%
  kable() %>%
  kable_styling()
# Build the matrix of cell counts passed to fisher.test(): drop the "Total"
# rows, put the levels of A in columns, and move the B labels from a column
# into the row names.
inner_cells <- dplyr::filter(df, A != "Total", B != "Total")
wide_counts <- as.data.frame(
  spread(dplyr::select(inner_cells, -probability), A, count)
)
row.names(wide_counts) <- wide_counts$B
count_data <- as.matrix(dplyr::select(wide_counts, -B))
fisher.test(count_data)##
## Fisher's Exact Test for Count Data
##
## data: count_data
## p-value = 0.2389
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.964496 1.158476
## sample estimates:
## odds ratio
## 1.057068
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: count_data
## X-squared = 1.3872, df = 1, p-value = 0.2389
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following.
# Load the Kaggle House Prices training and test sets.
# stringsAsFactors = TRUE is set explicitly because later steps call
# as.numeric() on columns such as ExterQual and CentralAir and therefore need
# factors; since R 4.0 read.csv() returns character columns by default.
data_dir <- "C:/Users/mutue/OneDrive/Documents/Data605/housing"
kaggle <- read.csv(file.path(data_dir, "train.csv"), stringsAsFactors = TRUE)
test <- read.csv(file.path(data_dir, "test.csv"), stringsAsFactors = TRUE)
str(kaggle)## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
## $ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
## $ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
## $ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
## $ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
## $ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
## $ MiscFeature : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
| Name | Piped data |
| Number of rows | 1460 |
| Number of columns | 38 |
| _______________________ | |
| Column type frequency: | |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
| MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
| LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 | ▇▃▁▁▁ |
| LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
| OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
| OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
| MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 | ▇▁▁▁▁ |
| BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 | ▇▁▁▁▁ |
| BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 | ▇▁▁▁▁ |
| BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 | ▇▅▂▁▁ |
| TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
| X1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
| X2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
| LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 | ▇▁▁▁▁ |
| GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
| BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
| FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
| BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
| TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
| Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▁▁ |
| GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 | ▁▁▅▅▇ |
| GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▃▇▂▁ |
| GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
| WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 | ▇▂▁▁▁ |
| OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 | ▇▁▁▁▁ |
| EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 | ▇▁▁▁▁ |
| X3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
| ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 | ▇▁▁▁▁ |
| PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 | ▇▁▁▁▁ |
| MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 | ▇▁▁▁▁ |
| MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
| YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
# Histogram of the raw sale price (p1) and of its natural log (p2), stored so
# they can be arranged side by side.
p1 <- ggplot(kaggle, aes(SalePrice)) +
  geom_histogram(fill = "blue") +
  theme_fivethirtyeight() +
  labs(title = "SalePrice")

p2 <- ggplot(kaggle, aes(log(SalePrice))) +
  geom_histogram(fill = "lightblue") +
  theme_fivethirtyeight() +
  labs(title = "Log (SalePrice)")
grid.arrange(p1, p2, ncol=2)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot: above-ground square footage (1st + 2nd floor) vs log sale price.
# NOTE(review): `fill` probably has no visible effect on the default point
# shape; `colour = "blue"` may be what was intended - confirm.
kaggle %>%
  mutate(MySqFt = X1stFlrSF + X2ndFlrSF) %>%
  ggplot(aes(x = MySqFt, y = log(SalePrice))) +
  geom_point(fill = "blue") +
  theme_fivethirtyeight() +
  labs(title = "SqFt vs log(Sale Price)")

# Scatterplot: garage area vs log sale price.
kaggle %>%
  ggplot(aes(x = GarageArea, y = log(SalePrice))) +
  geom_point(fill = "blue") +
  theme_fivethirtyeight() +
  labs(title = "Garage Area vs log(Sale Price)")

# Scatterplot matrix of selected numeric predictors and the sale price.
pairs(
  ~ OverallQual + YearBuilt + GarageCars + GrLivArea + TotalBsmtSF +
    TotRmsAbvGrd + YrSold + SalePrice,
  data = kaggle, main = "House Prices"
)

# Pearson correlation matrix of three quantitative variables, computed on the
# rows where all three are observed.
cor_dat <- kaggle[, c("GarageArea", "GrLivArea", "LotFrontage")]
cor_dat <- cor_dat[complete.cases(cor_dat), ]
cor <- cor(cor_dat, method = "pearson", use = "complete.obs")
kable(cor) %>%
kable_styling()
| | GarageArea | GrLivArea | LotFrontage |
|---|---|---|---|
| GarageArea | 1.0000000 | 0.4737098 | 0.3449967 |
| GrLivArea | 0.4737098 | 1.0000000 | 0.4027974 |
| LotFrontage | 0.3449967 | 0.4027974 | 1.0000000 |
The correlation between GrLivArea and LotArea is 0.263 with a p-value < 2.2e-16, so we reject the null hypothesis of zero correlation. The 80 percent confidence interval is [0.2316, 0.2941].
##
## Pearson's product-moment correlation
##
## data: kaggle$GrLivArea and kaggle$LotArea
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2315997 0.2940809
## sample estimates:
## cor
## 0.2631162
The correlation between LotArea and GarageArea is 0.180 with a p-value of 3.803e-12, so again we reject the null hypothesis of zero correlation. The 80 percent confidence interval is [0.1477, 0.2127].
##
## Pearson's product-moment correlation
##
## data: kaggle$LotArea and kaggle$GarageArea
## t = 7.0034, df = 1458, p-value = 3.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1477356 0.2126767
## sample estimates:
## cor
## 0.1804028
The correlation between GarageArea and GrLivArea is 0.469 with a p-value < 2.2e-16, so once again we reject the null hypothesis of zero correlation. The 80 percent confidence interval is [0.4424, 0.4948].
##
## Pearson's product-moment correlation
##
## data: kaggle$GarageArea and kaggle$GrLivArea
## t = 20.276, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4423993 0.4947713
## sample estimates:
## cor
## 0.4689975
The familywise error rate comes into play when multiple statistical tests are conducted on the same data. It measures the chance of making at least one Type I error across the whole family of tests. It is calculated as $\text{FWER} = 1 - (1 - \alpha)^k$, where $\alpha$ is the significance level of each test and $k$ is the number of comparisons. So, for three tests at $\alpha = 0.05$, $1 - (1 - 0.05)^3 = 0.1426$.
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)
## [,1] [,2] [,3]
## [1,] 1.0000000 0.4737098 0.3449967
## [2,] 0.4737098 1.0000000 0.4027974
## [3,] 0.3449967 0.4027974 1.0000000
| 1.3382921 | -0.5347486 | -0.2463110 |
| -0.5347486 | 1.4073399 | -0.3823863 |
| -0.2463110 | -0.3823863 | 1.2390007 |
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
This will generate the identity matrix.
| 1 | 0 | 0 |
| 0 | 1 | 0 |
| 0 | 0 | 1 |
This will too.
| 1 | 0 | 0 |
| 0 | 1 | 0 |
| 0 | 0 | 1 |
Execute the LU decomposition on the matrix.
The LU decomposition should yield the correlation matrix after multiplying the two components. In other words this:
# Multiply the L and U factors back together and give the product the same
# row and column names as the correlation matrix it is compared against.
A <- as.matrix(lu_decomposition$L %*% lu_decomposition$U)
rownames(A) <- rownames(cor_matrix)
colnames(A) <- colnames(cor_matrix)
A %>%
kable() %>%
kable_styling()
| 1.0000000 | 0.4737098 | 0.3449967 |
| 0.4737098 | 1.0000000 | 0.4027974 |
| 0.3449967 | 0.4027974 | 1.0000000 |
Should match this:
| 1.0000000 | 0.4737098 | 0.3449967 |
| 0.4737098 | 1.0000000 | 0.4027974 |
| 0.3449967 | 0.4027974 | 1.0000000 |
Which it does.
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of $\lambda$ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, $\lambda$)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Histogram of lot area, the right-skewed variable chosen for the exponential
# fit. The column is referenced by bare name inside aes() so it is looked up
# in the data passed to ggplot() rather than taken from the global `kaggle`.
p1 <- ggplot(kaggle, aes(LotArea)) +
  geom_histogram(fill = "blue") +
  theme_fivethirtyeight() +
  labs(title = "LotArea")
p1## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
| skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| numeric | data | 0 | 1 | 10516.83 | 9981.265 | 1300 | 7553.5 | 9478.5 | 11601.5 | 215245 | ▇▁▁▁▁ |
# Shift LotArea so that its smallest value becomes 0 before fitting the
# exponential distribution. The minimum is computed from the data (1300 in
# the training set) instead of being hard-coded.
new_LotArea <- kaggle %>%
  mutate(LotArea = LotArea - min(LotArea, na.rm = TRUE))
skim(new_LotArea$LotArea) %>%
kable() %>%
kable_styling()
| skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| numeric | data | 0 | 1 | 9216.828 | 9981.265 | 0 | 6253.5 | 8178.5 | 10301.5 | 213945 | ▇▁▁▁▁ |
## rate
## 1.084972e-04
## (2.839501e-06)
# Stack the shifted LotArea values ("Old Distribution") on top of the values
# in LotArea_dist ("New Distribution") in one long data frame, with a Data
# column identifying the source of each value.
simulated_values <- data.frame(Value = LotArea_dist, Data = "New Distribution")
observed_values <- data.frame(
  Value = new_LotArea$LotArea,
  Data = "Old Distribution"
)
histogram_data <- rbind(observed_values, simulated_values)
ggplot(histogram_data, aes(Value, fill = Data)) +
geom_histogram(bins = 30, alpha = 0.5) +
theme_fivethirtyeight()## 5% 95%
## 449.4322 26913.5289
## upper mean lower
## 9762.149 9180.779 8599.410
## 5% 95%
## 449.4322 26913.5289
# fix missing values with mice
#tmp_data <- mice(kaggle,maxit=3, method='rf',seed=20, print=F)
#kaggle <- complete(tmp_data,1)
# Derive the engineered features on the training set in a single mutate().
# Columns are created in the same order as before; later expressions reuse
# columns defined earlier in the same call.
kaggle <- kaggle %>%
  dplyr::mutate(
    # Basement + first floor + second floor area.
    TotalSF = TotalBsmtSF + X1stFlrSF + X2ndFlrSF,
    # Full baths plus half-weighted half baths (above ground and basement).
    TotalBR = FullBath + (0.5 * HalfBath) + (0.5 * BsmtHalfBath),
    # Sum of overall quality and condition scores, treated as a factor.
    QC = factor(OverallQual + OverallCond),
    # Sum of the integer codes of the exterior quality/condition factors.
    EC = factor(as.numeric(ExterQual) + as.numeric(ExterCond)),
    # 1/0 indicators.
    hasPool = as.numeric(PoolArea > 0),
    hasFence = as.numeric(!is.na(Fence)),
    hasFirePlace = as.numeric(Fireplaces > 0),
    SecondFloor = as.numeric(X2ndFlrSF > 0),
    # Logical: TRUE when either porch area is positive.
    hasPorch = (X3SsnPorch > 0) | (ScreenPorch > 0),
    # Square feet per above-ground room.
    SFPR = TotalSF / TotRmsAbvGrd,
    HYr = 2020 - YearBuilt - (YearRemodAdd - YearBuilt),
    # Number of the five amenities present.
    hasAll = hasPool + hasFence + hasFirePlace + hasPorch + SecondFloor,
    Upgrade = hasPool + as.numeric(X3SsnPorch > 0) + Fireplaces +
      as.numeric(WoodDeckSF > 0) + as.numeric(EnclosedPorch > 0) +
      as.numeric(CentralAir),
    # Share of total area that is not low-quality finished area.
    QualSF = (TotalSF - LowQualFinSF) / TotalSF
  )
# Multiple linear regression of sale price on the engineered features and a
# selection of the original predictors.
fit2 <- lm(
  SalePrice ~ TotalSF + TotalBR + QC + EC + HYr + MSSubClass + MSZoning +
    GrLivArea + LotConfig + hasPool + hasFirePlace + Condition1 +
    CentralAir + KitchenQual + SecondFloor + hasFence + Functional,
  data = kaggle
)
summary(fit2)##
## Call:
## lm(formula = SalePrice ~ TotalSF + TotalBR + QC + EC + HYr +
## MSSubClass + MSZoning + GrLivArea + LotConfig + hasPool +
## hasFirePlace + Condition1 + CentralAir + KitchenQual + SecondFloor +
## hasFence + Functional, data = kaggle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -444446 -16978 -1573 14524 279969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -51512.018 66144.753 -0.779 0.436242
## TotalSF 23.785 3.513 6.770 1.89e-11 ***
## TotalBR 11592.425 2534.725 4.573 5.22e-06 ***
## QC4 -23378.304 51421.842 -0.455 0.649440
## QC5 -16745.829 41871.030 -0.400 0.689263
## QC6 -41315.936 43974.100 -0.940 0.347609
## QC7 -20897.723 41571.884 -0.503 0.615262
## QC8 -28967.855 40743.974 -0.711 0.477219
## QC9 -25077.684 40465.649 -0.620 0.535538
## QC10 -16362.375 40497.212 -0.404 0.686247
## QC11 -15080.585 40544.775 -0.372 0.709987
## QC12 -11779.224 40548.446 -0.290 0.771479
## QC13 3999.508 40613.071 0.098 0.921567
## QC14 20524.622 40648.904 0.505 0.613691
## QC15 21860.597 41142.860 0.531 0.595272
## QC16 45313.067 41328.768 1.096 0.273090
## QC17 19397.708 44511.901 0.436 0.663058
## QC19 128700.688 55228.696 2.330 0.019930 *
## EC4 70368.719 53167.620 1.324 0.185876
## EC5 22232.069 56989.511 0.390 0.696516
## EC6 84728.687 50942.679 1.663 0.096493 .
## EC7 56860.761 51200.994 1.111 0.266957
## EC8 78612.254 51033.213 1.540 0.123685
## EC9 62022.984 51097.192 1.214 0.225019
## HYr -219.484 68.183 -3.219 0.001316 **
## MSSubClass -89.406 25.869 -3.456 0.000564 ***
## MSZoningFV 38361.797 13223.421 2.901 0.003777 **
## MSZoningRH 21098.906 15068.646 1.400 0.161679
## MSZoningRL 28584.931 12335.023 2.317 0.020626 *
## MSZoningRM 16975.502 12413.919 1.367 0.171701
## GrLivArea 36.217 6.474 5.594 2.67e-08 ***
## LotConfigCulDSac 18761.877 4390.212 4.274 2.05e-05 ***
## LotConfigFR2 -740.012 5766.052 -0.128 0.897899
## LotConfigFR3 -10378.360 18560.113 -0.559 0.576131
## LotConfigInside 410.257 2520.286 0.163 0.870713
## hasPool -36059.562 14223.137 -2.535 0.011344 *
## hasFirePlace 12637.135 2239.022 5.644 2.01e-08 ***
## Condition1Feedr -1773.982 6783.471 -0.262 0.793733
## Condition1Norm 17911.472 5599.130 3.199 0.001410 **
## Condition1PosA -5435.981 14040.297 -0.387 0.698689
## Condition1PosN -15409.946 9984.708 -1.543 0.122970
## Condition1RRAe -4746.025 12278.674 -0.387 0.699166
## Condition1RRAn 19555.132 8967.107 2.181 0.029367 *
## Condition1RRNe -13572.393 26089.820 -0.520 0.602994
## Condition1RRNn 15956.776 17351.289 0.920 0.357924
## CentralAirY 10778.818 4515.625 2.387 0.017118 *
## KitchenQualFa -62999.657 8190.849 -7.691 2.72e-14 ***
## KitchenQualGd -48760.696 4545.333 -10.728 < 2e-16 ***
## KitchenQualTA -55869.137 5177.144 -10.791 < 2e-16 ***
## SecondFloor -15067.205 3327.616 -4.528 6.46e-06 ***
## hasFence -3644.789 2567.144 -1.420 0.155893
## FunctionalMaj2 2889.770 19215.776 0.150 0.880482
## FunctionalMin1 32246.115 12085.020 2.668 0.007712 **
## FunctionalMin2 24523.094 11956.720 2.051 0.040453 *
## FunctionalMod 21729.215 13745.897 1.581 0.114154
## FunctionalSev -42359.647 37306.651 -1.135 0.256383
## FunctionalTyp 45433.764 10310.310 4.407 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35580 on 1403 degrees of freedom
## Multiple R-squared: 0.8072, Adjusted R-squared: 0.7995
## F-statistic: 104.9 on 56 and 1403 DF, p-value: < 2.2e-16
# fix missing values with mice
#tmp_data <- mice(test,maxit=3, method='rf',seed=20, print=F)
#test <- complete(tmp_data,1)
# Derive the engineered features on the test set in a single mutate().
# Columns are created in the same order as before; later expressions reuse
# columns defined earlier in the same call.
test <- test %>%
  dplyr::mutate(
    # Basement + first floor + second floor area.
    TotalSF = TotalBsmtSF + X1stFlrSF + X2ndFlrSF,
    # Full baths plus half-weighted half baths (above ground and basement).
    TotalBR = FullBath + (0.5 * HalfBath) + (0.5 * BsmtHalfBath),
    # Sum of overall quality and condition scores, treated as a factor.
    QC = factor(OverallQual + OverallCond),
    # Sum of the integer codes of the exterior quality/condition factors.
    EC = factor(as.numeric(ExterQual) + as.numeric(ExterCond)),
    # 1/0 indicators.
    hasFence = as.numeric(!is.na(Fence)),
    hasPool = as.numeric(PoolArea > 0),
    SecondFloor = as.numeric(X2ndFlrSF > 0),
    # Logical: TRUE when either porch area is positive.
    hasPorch = (X3SsnPorch > 0) | (ScreenPorch > 0),
    hasFirePlace = as.numeric(Fireplaces > 0),
    # Number of the five amenities present.
    hasAll = hasPool + hasFence + hasFirePlace + hasPorch + SecondFloor,
    # Square feet per above-ground room.
    SFPR = TotalSF / TotRmsAbvGrd,
    HYr = 2020 - YearBuilt - (YearRemodAdd - YearBuilt),
    Upgrade = hasPool + as.numeric(X3SsnPorch > 0) + Fireplaces +
      as.numeric(WoodDeckSF > 0) + as.numeric(EnclosedPorch > 0) +
      as.numeric(CentralAir),
    # Share of total area that is not low-quality finished area.
    QualSF = (TotalSF - LowQualFinSF) / TotalSF,
    # Lot frontage relative to lot area.
    ALot = LotFrontage / LotArea
  )
# Replace the QC level set recorded on the fitted model with the union of the
# levels it already holds and the levels of test$QC.
# NOTE(review): presumably this lets predict() accept QC values that never
# occurred in the training data; the fit has no coefficients for such levels,
# so predictions for those rows should be verified - confirm.
fit2$xlevels[["QC"]] <- union(fit2$xlevels[["QC"]], levels(test$QC))## Warning in predict.lm(fit2, newdata = test): prediction from a rank-deficient
## fit may be misleading
##
## Call:
## lm(formula = SalePrice ~ TotalSF + TotalBR + QC + EC + HYr +
## MSSubClass + MSZoning + GrLivArea + LotConfig + hasPool +
## hasFirePlace + Condition1 + CentralAir + KitchenQual + SecondFloor +
## hasFence + Functional, data = kaggle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -444446 -16978 -1573 14524 279969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -51512.018 66144.753 -0.779 0.436242
## TotalSF 23.785 3.513 6.770 1.89e-11 ***
## TotalBR 11592.425 2534.725 4.573 5.22e-06 ***
## QC4 -23378.304 51421.842 -0.455 0.649440
## QC5 -16745.829 41871.030 -0.400 0.689263
## QC6 -41315.936 43974.100 -0.940 0.347609
## QC7 -20897.723 41571.884 -0.503 0.615262
## QC8 -28967.855 40743.974 -0.711 0.477219
## QC9 -25077.684 40465.649 -0.620 0.535538
## QC10 -16362.375 40497.212 -0.404 0.686247
## QC11 -15080.585 40544.775 -0.372 0.709987
## QC12 -11779.224 40548.446 -0.290 0.771479
## QC13 3999.508 40613.071 0.098 0.921567
## QC14 20524.622 40648.904 0.505 0.613691
## QC15 21860.597 41142.860 0.531 0.595272
## QC16 45313.067 41328.768 1.096 0.273090
## QC17 19397.708 44511.901 0.436 0.663058
## QC19 128700.688 55228.696 2.330 0.019930 *
## EC4 70368.719 53167.620 1.324 0.185876
## EC5 22232.069 56989.511 0.390 0.696516
## EC6 84728.687 50942.679 1.663 0.096493 .
## EC7 56860.761 51200.994 1.111 0.266957
## EC8 78612.254 51033.213 1.540 0.123685
## EC9 62022.984 51097.192 1.214 0.225019
## HYr -219.484 68.183 -3.219 0.001316 **
## MSSubClass -89.406 25.869 -3.456 0.000564 ***
## MSZoningFV 38361.797 13223.421 2.901 0.003777 **
## MSZoningRH 21098.906 15068.646 1.400 0.161679
## MSZoningRL 28584.931 12335.023 2.317 0.020626 *
## MSZoningRM 16975.502 12413.919 1.367 0.171701
## GrLivArea 36.217 6.474 5.594 2.67e-08 ***
## LotConfigCulDSac 18761.877 4390.212 4.274 2.05e-05 ***
## LotConfigFR2 -740.012 5766.052 -0.128 0.897899
## LotConfigFR3 -10378.360 18560.113 -0.559 0.576131
## LotConfigInside 410.257 2520.286 0.163 0.870713
## hasPool -36059.562 14223.137 -2.535 0.011344 *
## hasFirePlace 12637.135 2239.022 5.644 2.01e-08 ***
## Condition1Feedr -1773.982 6783.471 -0.262 0.793733
## Condition1Norm 17911.472 5599.130 3.199 0.001410 **
## Condition1PosA -5435.981 14040.297 -0.387 0.698689
## Condition1PosN -15409.946 9984.708 -1.543 0.122970
## Condition1RRAe -4746.025 12278.674 -0.387 0.699166
## Condition1RRAn 19555.132 8967.107 2.181 0.029367 *
## Condition1RRNe -13572.393 26089.820 -0.520 0.602994
## Condition1RRNn 15956.776 17351.289 0.920 0.357924
## CentralAirY 10778.818 4515.625 2.387 0.017118 *
## KitchenQualFa -62999.657 8190.849 -7.691 2.72e-14 ***
## KitchenQualGd -48760.696 4545.333 -10.728 < 2e-16 ***
## KitchenQualTA -55869.137 5177.144 -10.791 < 2e-16 ***
## SecondFloor -15067.205 3327.616 -4.528 6.46e-06 ***
## hasFence -3644.789 2567.144 -1.420 0.155893
## FunctionalMaj2 2889.770 19215.776 0.150 0.880482
## FunctionalMin1 32246.115 12085.020 2.668 0.007712 **
## FunctionalMin2 24523.094 11956.720 2.051 0.040453 *
## FunctionalMod 21729.215 13745.897 1.581 0.114154
## FunctionalSev -42359.647 37306.651 -1.135 0.256383
## FunctionalTyp 45433.764 10310.310 4.407 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35580 on 1403 degrees of freedom
## Multiple R-squared: 0.8072, Adjusted R-squared: 0.7995
## F-statistic: 104.9 on 56 and 1403 DF, p-value: < 2.2e-16
“Kaggle Rank”