library(e1071)
library(pander)
library(ggplot2)
library(reshape2)
library(MASS)
library(matrixcalc)
library(car)
library(psych)
library(compositions)
library(DiagrammeR)
library(pracma)
library(knitr)
library(stats)
library(kableExtra)
library(grid)
library(gridExtra)
library(data.table)
library(RCurl)
options(scipen=8)
set.seed(0305)
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of (N+1)/2.
Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
#Random uniform and normal generators
X <- runif(10000, min = 1, max=6)
head(X)
## [1] 5.002059 5.852818 3.280193 3.015978 2.319642 5.673100
Y <- rnorm(10000,mean=(7/2),sd=(7/2))
x=median(X)
y<-quantile(Y, 0.25)
probstr <-as.data.frame(cbind(X,Y))
summary(probstr)
## X Y
## Min. :1.001 Min. :-9.592
## 1st Qu.:2.260 1st Qu.: 1.109
## Median :3.495 Median : 3.469
## Mean :3.509 Mean : 3.496
## 3rd Qu.:4.767 3rd Qu.: 5.859
## Max. :6.000 Max. :16.886
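For reference, the two cutoff values used in the probability calculations below can be printed directly (a small sketch; unname() strips the quantile's "25%" label):
#Print the cutoffs: x = median of X, y = 1st quartile of Y
c(x = x, y = unname(y))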
#Create a matrix based on the probabilities conditions
#Total Observations
total.obs<-nrow(probstr)
#a. P(X>x | X>y) b. P(X>x, Y>y) c. P(X<x | X>y)
#Number of observations where X>x and X>y
XgxXgy.data <- probstr[which( probstr$X >x & probstr$X>y),]
XgxXgy <- nrow(XgxXgy.data)
#Number of observations X>x and Y>y
XgxYgy.data <- probstr[which(probstr$X>x & probstr$Y>y),]
XgxYgy <- nrow(XgxYgy.data)
#Number of observations where X<x and X>y
XlxXgy.dat <- probstr[which(probstr$X<=x & probstr$X>y),]
XlxXgy <- nrow(XlxXgy.dat)
#Number of observations where X>x and Y<y
XgxYly.dat <- probstr[which(probstr$X>x & probstr$Y<y),]
XgxYly <- nrow(XgxYly.dat)
#Number of observations where Xlx and Ygy
XlxYgy.dat <- probstr[which(probstr$X<x & probstr$Y>y),]
XlxYgy <- nrow(XlxYgy.dat)
#Number of observations where Xlx and Yly
XlxYly.dat <- probstr[which(probstr$X<x & probstr$Y<y),]
XlxYly <- nrow(XlxYly.dat)
#XlxYgy,XlxYly
#Number of observations where X>y
Xgy.dat <- probstr[which(probstr$X>y),]
Xgy <- nrow(Xgy.dat)
#Number of observations where X>x
Xgx.dat <- probstr[which(probstr$X>x),]
Xgx <-nrow(Xgx.dat)
#Number of observations where Y>y
Ygy.dat <- probstr[which(probstr$Y>y),]
Ygy <-nrow(Ygy.dat)
#Number of observations where X<x
Xlx.dat <- probstr[which(probstr$X<x),]
Xlx <- nrow(Xlx.dat)
#Number of observations where Y<y
Yly.dat <- probstr[which(probstr$Y<y),]
Yly <-nrow(Yly.dat)
#Number of observations where X<y
Xly.dat <- probstr[which(probstr$X<y),]
Xly <-nrow(Xly.dat)
#Number of observations where Y>x
Ygx.dat <- probstr[which(probstr$Y>x),]
Ygx <- nrow(Ygx.dat)
#Number of observations where Y<x
Ylx.dat <- probstr[which(probstr$Y<x),]
Ylx <- nrow(Ylx.dat)
#Calc 1
(length(X[X > x & X > y])/length(X)) / (length(X[X > y])/ length(X))
## [1] 0.5104124
\(P(X>x \mid X>y)=\frac { P(X>x,\,X>y) }{ P(X>y) } =\frac { P(X>x) }{ P(X>y) }\), since here \(x > y\), so \(\{X>x\}\subset\{X>y\}\).
This reads as: there is about a 51% chance that X (the uniform variable) is greater than its median (approx. 3.50), given that X is greater than 1.11 (the 1st quartile of Y).
#Calc 2
(length(X[X > x & Y > y])/length(X))
## [1] 0.3754
\(P(X>x,\,Y>y)=P(X>x)\,P(Y>y)\) when X and Y are independent.
This reads: there is about a 37.5% chance that X is greater than its median (approx. 3.50) and Y is greater than 1.11 (the 1st quartile of Y).
This is known as a joint probability
(length(X[X < x & X > y])/length(X)) / (length(X[X > y])/ length(X))
## [1] 0.4895876
This reads: there is about a 49% chance that X (the uniform variable) is less than its median (approx. 3.50), given that X is greater than 1.11 (the 1st quartile of Y).
\(P(X<x \mid X>y)=\frac { P(X<x,\,X>y) }{ P(X>y) } =\frac { P(y<X<x) }{ P(X>y) }\)
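As a compact cross-check (a sketch using logical vectors and mean() rather than subsetting), all three probabilities can be recomputed in one step:
#a. P(X>x | X>y), b. P(X>x, Y>y), c. P(X<x | X>y)
c(a = mean(X > x & X > y) / mean(X > y),
  b = mean(X > x & Y > y),
  c = mean(X < x & X > y) / mean(X > y))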
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
#Frequency Table
cont.table <- matrix(c(Xgx,Ygx,Xlx,Ylx,Xgy,Ygy,Xly,Yly),nrow=2,ncol=4)
rownames(cont.table) <- c('X','Y')
kable(cont.table, digits = 4, col.names = c('>x','<x','>y','<y'), align = "l", caption = 'Observed Data Joint Frequencies')
|   | >x   | <x   | >y   | <y   |
|---|------|------|------|------|
| X | 5000 | 5000 | 9796 | 204  |
| Y | 4972 | 5028 | 7500 | 2500 |
#Contingency Table
#rows index X (>x / <x), columns index Y (>y / <y); matrix() fills column-major
cont.table <- matrix(c(XgxYgy, XlxYgy, XgxYly, XlxYly), nrow=2, ncol=2)
rownames(cont.table) <- c('(X>x)','(X<x)')
kable(cont.table, digits = 4, col.names = c('(Y>y)','(Y<y)'), align = "l", caption = 'Observed Joint Counts')
|       | (Y>y) | (Y<y) |
|-------|-------|-------|
| (X>x) | 3754  | 1246  |
| (X<x) | 3746  | 1254  |
#P(X>x and Y>y)=P(X>x)P(Y>y)
#Calculating Joint Probs
Xgxp<-Xgx/total.obs
Ygyp<-Ygy/total.obs
Xlxp<-Xlx/total.obs
Ylyp<-Yly/total.obs
XgxYgyp<-round(XgxYgy/total.obs,3)
XgxYlyp<-round(XgxYly/total.obs,3)
XlxYgyp<-round(XlxYgy/total.obs,3)
XlxYlyp<-round(XlxYly/total.obs,3)
\(P(X>x \text{ and } Y>y)=P(X>x)\,P(Y>y)\)
identical(XgxYgyp, Xgxp*Ygyp)
## [1] TRUE
XgxYgy/total.obs
## [1] 0.3754
Xgxp*Ygyp
## [1] 0.375
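Because identical() compares the rounded values exactly, a tolerance-based comparison of the unrounded proportions is a more robust check (a sketch using base R's all.equal):
#Compare the joint proportion with the product of the marginals, allowing a small relative tolerance
all.equal(XgxYgy / total.obs, (Xgx / total.obs) * (Ygy / total.obs), tolerance = 0.01)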
According to this check, the variables X and Y appear to be independent. Let's confirm with a Chi-square test and Fisher's exact test.
Null Hypothesis (\(H_0\)): The probability of X>x is not influenced by the probability of Y>y (the two events are independent).
Alternative Hypothesis (\(H_A\)): The probability of X>x is influenced by the probability of Y>y (the two events are not independent).
#Chi Square Test
chisq.test(cont.table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont.table
## X-squared = 0.026133, df = 1, p-value = 0.8716
#Fisher Test
fisher.test(cont.table)
##
## Fisher's Exact Test for Count Data
##
## data: cont.table
## p-value = 0.8716
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9202847 1.1052820
## sample estimates:
## odds ratio
## 1.00857
Both p-values are far greater than the 0.05 threshold, reaffirming that the two variables are independent; we fail to reject the null hypothesis.
# Load training data from GitHub
path <- ('https://raw.githubusercontent.com/sortega7878/DATA605CSV/master/train.csv')
con <- file(path, open="r")
train <- read.csv(con, header=T, stringsAsFactors = F)
close(con)
# Load test data from GitHub
path <- ('https://raw.githubusercontent.com/sortega7878/DATA605CSV/master/test.csv')
con <- file(path, open="r")
test <- read.csv(con, header=T, stringsAsFactors = F)
close(con)
Pick two of the quantitative independent variables and one dependent variable from the training data set.
Looking at the mean and median values of each variable, as given by the summary function, we can choose our right-skewed variable to represent "X" by picking a predictor whose mean is larger than its median. The variables that appear to be the most right-skewed are LotArea, TotalBsmtSF, GrLivArea, X1stFlrSF, BsmtFinSF1, MasVnrArea, and OverallCond. LotArea seems to be more skewed than the others, so it is chosen as our X variable. For the first analysis we also select GrLivArea (the above-grade living area) as the second independent variable and define it as Z. SalePrice is the dependent variable, and we define it as Y.
X <- train$LotArea
Z <- train$GrLivArea
Y <- train$SalePrice
Provide univariate descriptive statistics and appropriate plots for the training data set.
Due to the number of variables (81), a table of descriptive statistics for the numerical variables has been created. The table displays the mean, minimum value, median, maximum value, IQR, standard deviation, and skewness, the last computed with the skewness function from the e1071 package.
# number of variable types
table(sapply(train, class))
##
## character integer
## 43 38
Only numerical predictors were used, as there are 40+ character vectors. We could split these character vectors into binary indicator variables (1 if an observation takes a given value, 0 otherwise) for each value within a character vector, but this would create many variables, most of which probably would not contribute much to predicting the Sale Price when we build a model in a later section; a sketch of this encoding follows.
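For illustration only, here is a minimal sketch of such an indicator-variable expansion; it assumes the training data contains a character column named Neighborhood (present in the Kaggle data) and is not used in the models below.
#Hypothetical example: expand one character variable into 0/1 indicator columns
nbhd_dummies <- model.matrix(~ Neighborhood - 1, data = train)
dim(nbhd_dummies)  #one indicator column per neighborhood level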
# subset training data numerical values
train_int <- train[c(2, 4, 5, 18:21, 27, 35, 37:39, 44:53, 55, 57, 60, 62:63, 67:72, 76:78, 81)]
means <- sapply(train_int, function(y) mean(y, na.rm = TRUE))
mins <- sapply(train_int, function(y) min(y, na.rm=TRUE))
medians <- sapply(train_int, function(y) median(y, na.rm = TRUE))
maxs <- sapply(train_int, function(y) max(y, na.rm=TRUE))
IQRs <- sapply(train_int, function(y) IQR(y, na.rm = TRUE))
SDs <- sapply(train_int, function(y) sd(y, na.rm = T))
skews <- sapply(train_int, function(y) skewness(y, na.rm = TRUE))
datasummary <- data.frame(means, mins, medians, maxs, IQRs, SDs, skews)
colnames(datasummary) <- c("MEAN", "MIN","MEDIAN", "MAX", "IQR", "STD. DEV", "SKEW")
datasummary <- round(datasummary, 2)
pander(datasummary)
Variable | MEAN | MIN | MEDIAN | MAX | IQR | STD. DEV | SKEW |
---|---|---|---|---|---|---|---|
MSSubClass | 56.9 | 20 | 50 | 190 | 50 | 42.3 | 1.4 |
LotFrontage | 70.05 | 21 | 69 | 313 | 21 | 24.28 | 2.16 |
LotArea | 10517 | 1300 | 9478 | 215245 | 4048 | 9981 | 12.18 |
OverallQual | 6.1 | 1 | 6 | 10 | 2 | 1.38 | 0.22 |
OverallCond | 5.58 | 1 | 5 | 9 | 1 | 1.11 | 0.69 |
YearBuilt | 1971 | 1872 | 1973 | 2010 | 46 | 30.2 | -0.61 |
YearRemodAdd | 1985 | 1950 | 1994 | 2010 | 37 | 20.65 | -0.5 |
MasVnrArea | 103.7 | 0 | 0 | 1600 | 166 | 181.1 | 2.66 |
BsmtFinSF1 | 443.6 | 0 | 383.5 | 5644 | 712.2 | 456.1 | 1.68 |
BsmtFinSF2 | 46.55 | 0 | 0 | 1474 | 0 | 161.3 | 4.25 |
BsmtUnfSF | 567.2 | 0 | 477.5 | 2336 | 585 | 441.9 | 0.92 |
TotalBsmtSF | 1057 | 0 | 991.5 | 6110 | 502.5 | 438.7 | 1.52 |
X1stFlrSF | 1163 | 334 | 1087 | 4692 | 509.2 | 386.6 | 1.37 |
X2ndFlrSF | 347 | 0 | 0 | 2065 | 728 | 436.5 | 0.81 |
LowQualFinSF | 5.84 | 0 | 0 | 572 | 0 | 48.62 | 8.99 |
GrLivArea | 1515 | 334 | 1464 | 5642 | 647.2 | 525.5 | 1.36 |
BsmtFullBath | 0.43 | 0 | 0 | 3 | 1 | 0.52 | 0.59 |
BsmtHalfBath | 0.06 | 0 | 0 | 2 | 0 | 0.24 | 4.09 |
FullBath | 1.57 | 0 | 2 | 3 | 1 | 0.55 | 0.04 |
HalfBath | 0.38 | 0 | 0 | 2 | 1 | 0.5 | 0.67 |
BedroomAbvGr | 2.87 | 0 | 3 | 8 | 1 | 0.82 | 0.21 |
KitchenAbvGr | 1.05 | 0 | 1 | 3 | 0 | 0.22 | 4.48 |
TotRmsAbvGrd | 6.52 | 2 | 6 | 14 | 2 | 1.63 | 0.67 |
Fireplaces | 0.61 | 0 | 1 | 3 | 1 | 0.64 | 0.65 |
GarageYrBlt | 1979 | 1900 | 1980 | 2010 | 41 | 24.69 | -0.65 |
GarageCars | 1.77 | 0 | 2 | 4 | 1 | 0.75 | -0.34 |
GarageArea | 473 | 0 | 480 | 1418 | 241.5 | 213.8 | 0.18 |
WoodDeckSF | 94.24 | 0 | 0 | 857 | 168 | 125.3 | 1.54 |
OpenPorchSF | 46.66 | 0 | 25 | 547 | 68 | 66.26 | 2.36 |
EnclosedPorch | 21.95 | 0 | 0 | 552 | 0 | 61.12 | 3.08 |
X3SsnPorch | 3.41 | 0 | 0 | 508 | 0 | 29.32 | 10.28 |
ScreenPorch | 15.06 | 0 | 0 | 480 | 0 | 55.76 | 4.11 |
PoolArea | 2.76 | 0 | 0 | 738 | 0 | 40.18 | 14.8 |
MiscVal | 43.49 | 0 | 0 | 15500 | 0 | 496.1 | 24.43 |
MoSold | 6.32 | 1 | 6 | 12 | 3 | 2.7 | 0.21 |
YrSold | 2008 | 2006 | 2008 | 2010 | 2 | 1.33 | 0.1 |
SalePrice | 180921 | 34900 | 163000 | 755000 | 84025 | 79443 | 1.88 |
To illustrate the descriptive statistics of the numeric variables, box plots have been created below. These plots were chosen because they effectively show the spread and skewness of the data, the median, and any outliers.
train_int_melted <- melt(train_int)
## No id variables; using all as measure variables
ggplot(train_int_melted, aes(variable, value)) + geom_boxplot(aes(fill = variable), alpha = 0.75, show.legend = FALSE) + facet_wrap(~variable, scale="free") + scale_y_continuous('') + scale_x_discrete('', breaks = NULL) + ggtitle("Distribution of Predictor and Target Variables\n")
## Warning: Removed 348 rows containing non-finite values (stat_boxplot).
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
pairs(~LotArea+GrLivArea+SalePrice, data=train, main="2 Independent 1 dependent variable")
Looking at the above scatterplot matrix, there is a weak linear relationship between the lot size and the sale price of the house. Most of the sale prices are concentrated between 100k and 300k, while the lot sizes have much less spread. The larger lot sizes do not necessarily belong to the most expensive properties, which is why we do not see a stronger correlation; in fact, there are a handful of cases where an extreme lot size skews the relationship between the variable and the sale price. The second relationship, with the living area (GrLivArea), shows a stronger linear pattern, which is easy to infer not only from the graphic but also from common sense.
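To back the visual impression with numbers, here is a quick sketch of the pairwise correlations among the three plotted variables:
#Pairwise correlations behind the scatterplot matrix
round(cor(train[, c("LotArea", "GrLivArea", "SalePrice")]), 3)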
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Testing for:
H0: ρ = 0 (“the population correlation coefficient is 0; there is no association”)
H1: ρ ≠ 0 (“the population correlation coefficient is not 0; a nonzero correlation could exist”)
print('values of XZ --r,p')
## [1] "values of XZ --r,p"
zct <- corr.test(X,Z, alpha=0.2, minlength = 5)
zct$r
## [1] 0.2631162
zct$p
## [1] 1.520481e-24
print('values of XY --r,p')
## [1] "values of XY --r,p"
yct <- corr.test(X,Y, alpha=0.2, minlength = 5)
yct$r
## [1] 0.2638434
yct$p
## [1] 1.123139e-24
FWER <- 1-(1-0.2)^2
FWER
## [1] 0.36
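As a cross-check (a sketch, not part of the corr.test output above), stats::cor.test can report the 80% confidence interval for each correlation directly:
#80% confidence intervals for the correlation coefficients themselves
cor.test(X, Z, conf.level = 0.80)$conf.int
cor.test(X, Y, conf.level = 0.80)$conf.int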
The table below shows the number of cases, mean, and standard deviation for the X, Y, and Z variables:
VAR | n | \(\bar{x}\) | \(s\) |
---|---|---|---|
X | 1460 | 10516.83 | 9981.265 |
Y | 1460 | 180921.2 | 79442.5 |
Z | 1460 | 1515 | 525.5 |
We can derive the point estimate (the difference in means) by subtracting the mean of Y from the mean of X, and likewise for the pairwise X and Z comparison.
The difference in means for XY (point estimate) = -170404.4; the difference in means for XZ (point estimate) = 9001.83.
To get the standard error (\(SE\)) of the point estimate, we sum each sample's variance divided by its sample size and take the square root of that value:
# Standard Error of Point Estimate
XYSE <- sqrt((var(X) / length(X)) + (var(Y) / length(Y)))
XZSE <- sqrt((var(X) / length(X)) + (var(Z) / length(Z)))
XYSE = 2095.451
XZSE = 261.58
The T-score is the point estimate divided by the standard error:
TXY = -170404.4 / 2095.451 = -81.321
TXZ = 9001.83 / 261.58 = 34.41
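For completeness, a short sketch that reproduces the point estimates, standard errors, and t statistics quoted above (the helper names pe_* and se_* are just for this sketch):
#Point estimates (differences in means), standard errors, and t statistics
pe_XY <- mean(X) - mean(Y); pe_XZ <- mean(X) - mean(Z)
se_XY <- sqrt(var(X)/length(X) + var(Y)/length(Y))
se_XZ <- sqrt(var(X)/length(X) + var(Z)/length(Z))
c(t_XY = pe_XY / se_XY, t_XZ = pe_XZ / se_XZ)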
The confidence interval for the difference in means can be calculated by taking the point estimate \(\pm\) the t-value associated with the degrees of freedom \(\times\) the standard error. Here we use R's t.test function to calculate an 80% confidence interval for the difference in means:
t.test(X, Y, conf.level = 0.8)
##
## Welch Two Sample t-test
##
## data: X and Y
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
## -173091.0 -167717.8
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
t.test(X, Z, conf.level = 0.8)
##
## Welch Two Sample t-test
##
## data: X and Z
## t = 34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
## 8665.981 9336.748
## sample estimates:
## mean of x mean of y
## 10516.828 1515.464
From the output, we are 80% confident that the true difference in means lies between -173091.0 and -167717.8. The interval is negative because the mean of Y is much larger than the mean of X.
For the XZ relationship, we are 80% confident that the true difference in means lies between 8665.981 and 9336.748.
Both pairwise correlations are positive and the p-values are very small, so we reject the null hypothesis in each case. With two tests at \(\alpha = 0.2\), the familywise error rate is \(1-(1-0.2)^2 = 0.36\), so the chance of at least one false positive across the family is fairly high in principle; however, because both p-values are extremely small, familywise error is not a practical concern for the conclusions drawn here.
Invert your correlation matrix from above. This inverse is known as the precision matrix and contains variance inflation factors on the diagonal.
XY <- data.frame(X, Y)
cor_matrix <- cor(XY)
cor_matrix
## X Y
## X 1.0000000 0.2638434
## Y 0.2638434 1.0000000
prec_matrix <- solve(cor_matrix)
prec_matrix
## X Y
## X 1.0748219 -0.2835846
## Y -0.2835846 1.0748219
This inverted correlation matrix has variance inflation factors on the diagonal, which are a function of how closely one variable is a linear function of the other variable.
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
cor_matrix %*% prec_matrix
## X Y
## X 1 0
## Y 0 1
prec_matrix %*% cor_matrix
## X Y
## X 1 0
## Y 0 1
Since the precision matrix is the inverse of the correlation matrix, multiplying the two together will result in the identity matrix, which we see in the resulting \(2 \times 2\) matrix with 1s on the diagonal.
In the above process, we switched the order of multiplying the correlation and the precision matrix, both resulting in an identity matrix. This is proving that the correlation matrix is invertible, since
\(A^{-1}A = I\) and \(AA^{-1} = I\)
LuA <- lu.decomposition(cor_matrix)
L <- LuA$L
U <- LuA$U
L
## [,1] [,2]
## [1,] 1.0000000 0
## [2,] 0.2638434 1
U
## [,1] [,2]
## [1,] 1 0.2638434
## [2,] 0 0.9303867
Multiplying the decomposed factors back together reconstructs the original correlation matrix, as a proof of the decomposition:
#Proof
proof <- LuA$L%*%LuA$U
proof
## [,1] [,2]
## [1,] 1.0000000 0.2638434
## [2,] 0.2638434 1.0000000
cor_matrix
## X Y
## X 1.0000000 0.2638434
## Y 0.2638434 1.0000000
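As an additional sanity check (a sketch), all.equal confirms the reconstruction matches the correlation matrix up to numerical tolerance (dimnames are ignored):
#L %*% U should reproduce cor_matrix up to floating-point error
all.equal(proof, cor_matrix, check.attributes = FALSE)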
For your variable that is skewed to the right, shift it so that the minimum value is above zero.
X (LotArea) is skewed to the right, but its minimum value is already above zero, as no negative lot sizes exist in the data set. In fact, none of the numerical variables have any values below 0, as indicated in the table in the descriptive statistics section.
min(X)
## [1] 1300
Load the MASS package and run fitdistr to fit an exponential probability density function.
(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).
expdf <- fitdistr(X, densfun="exponential")
Find the optimal value of \(\lambda\) for this distribution.
In many examples, the exponential distribution is used to model the probability of events occurring over a period of time (time elapsed between customers, weather events, phone calls, etc.). The probability of an event occurring is proportional to the length of the interval. In other words, if an event happens six times in an hour, \(\lambda = 6\) per hour, and the expected value, the average wait time, is \(\frac{1}{\lambda}\), or 10 minutes.
For this distribution, the skewed lot-size variable was used instead of moments in time, so \(\lambda\) here is the rate associated with lot area and its reciprocal is the mean lot size.
# get value of lambda from exponential distribution
lambda <- expdf$estimate
# expected value of lambda
rate <- 1 / lambda
rate
## rate
## 10516.83
We can see that this is the same as the mean given in the descriptive statistics for the X variable (LotArea). The fitted value of \(\lambda\) is equal to one divided by this mean (\(\frac{1}{10516.83}\)).
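As a quick check (a sketch), the maximum-likelihood estimate of \(\lambda\) for an exponential sample is exactly one over the sample mean, which should match the fitdistr result:
#The exponential MLE of lambda equals 1 / mean(X)
c(fitdistr = unname(lambda), closed_form = 1 / mean(X))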
Then, take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, some_val)).
# 1000 samples from expon. dist. using lambda
expdf_samp <- rexp(1000, lambda)
Plot a histogram and compare it with a histogram of your original variable.
A histogram of the exponential fit of the original X variable (LotArea) is below:
hist(expdf_samp, col="royalblue", main="Histogram of Exponential Fit of X")
Here we see that the histogram follows the pattern of an exponential pdf: the first bin has the highest count, and the counts get successively smaller with each bin. The distribution is right-skewed, but decays smoothly.
Below is a histogram of the original X variable, without the exponential fit:
hist(X, col="royalblue", main="Histogram of X Variable", breaks=20)
The histogram of the original X variable is also right-skewed, but much more so than the exponential fit of X. The summary statistics below illustrate the difference in the spread of the data.
Summary Stats of Exponential Fit of X:
summary(expdf_samp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.29 3288.17 7276.83 10230.06 13992.43 76516.83
Summary Stats of X:
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
The values of the observations of X are much more spread out, with the difference between the minimum and maximum value being 213945, versus roughly 76510 in the exponential fit of X.
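For a side-by-side visual comparison, a sketch using base graphics with a common x-axis range that covers both samples:
#Plot both histograms on the same x-axis range for easier comparison
par(mfrow = c(1, 2))
hist(expdf_samp, col = "royalblue", breaks = 20, xlim = c(0, 220000), main = "Exponential fit of X", xlab = "")
hist(X, col = "royalblue", breaks = 20, xlim = c(0, 220000), main = "Original X (LotArea)", xlab = "")
par(mfrow = c(1, 1))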
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).
# 5 and 95 percentile of exponential pdf
qexp(c(.05, .95), rate = lambda)
## [1] 539.4428 31505.6013
Generate a 95% confidence interval from the empirical data, assuming normality.
Assuming normality (a normal distribution) for the originally observed data for X, R's qnorm quantile function can be used to generate the 95% interval. Unlike the problem above, we are not looking for the 5th and 95th percentiles, but for the central 95% (0.025 in each tail, because the interval is two-tailed).
qnorm(c(.025, .975), mean=mean(X), sd=sd(X))
## [1] -9046.092 30079.748
If the data for our X variable were normally distributed, about 95% of the observed values would fall between -9046.1 and 30079.7; this interval is built from the sample mean and standard deviation of X rather than from the standard error of the mean.
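For comparison (a sketch, not part of the original answer), a 95% confidence interval for the mean of X under the normality assumption can be obtained with t.test:
#95% confidence interval for the mean of X
t.test(X, conf.level = 0.95)$conf.int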
Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
As in the first (probability) section, where the quartile cutoffs for X and Y were obtained, the quantile function in R can be used to provide the 5th and 95th percentiles of the observed values of X.
# 5th and 95th percentiles of empirical data X var
quantile(X, c(.05, .95))
## 5% 95%
## 3311.70 17401.15
5% of the values will be equal to 3311.70 or less, and 5% of the values will be equal to or greater than 17401.15. 90% of the values will be in-between these two percentiles, so we essentially have a 90% confidence interval.
In the descriptive statistics section, the number of categorical variables was found to be over 40. To keep things somewhat simple, the subset of the train dataset's numerical variables (minus the Id variable) is used for creating a predictive model.
First, a linear model using all of the numerical predictors is used:
model0 <- glm(SalePrice ~ ., data=train_int)
full_mod_summ <- summary(model0)
From the summary output, there are a number of predictor variables that have been flagged as highly significant (***). Let's build a model using these highly-significant predictors:
# AIC 34834
model1 <- glm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + X1stFlrSF + X2ndFlrSF + BedroomAbvGr + GarageCars, data=train_int)
summary(model1)
##
## Call:
## glm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + X1stFlrSF + X2ndFlrSF + BedroomAbvGr +
## GarageCars, data = train_int)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -466869 -18409 -2319 15022 274328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -993242.4309 88871.5603 -11.176 < 2e-16 ***
## MSSubClass -171.4795 24.9561 -6.871 0.00000000000942 ***
## LotArea 0.5727 0.1025 5.585 0.00000002792578 ***
## OverallQual 19990.2071 1129.8259 17.693 < 2e-16 ***
## OverallCond 5975.4051 953.9547 6.264 0.00000000049402 ***
## YearBuilt 463.8347 45.4750 10.200 < 2e-16 ***
## X1stFlrSF 80.7214 3.6522 22.102 < 2e-16 ***
## X2ndFlrSF 63.0789 3.3690 18.723 < 2e-16 ***
## BedroomAbvGr -9771.5864 1487.0266 -6.571 0.00000000006940 ***
## GarageCars 11649.8427 1760.5266 6.617 0.00000000005135 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1336024492)
##
## Null deviance: 9207911334610 on 1459 degrees of freedom
## Residual deviance: 1937235513612 on 1450 degrees of freedom
## AIC: 34834
##
## Number of Fisher Scoring iterations: 2
Another model is built using a log transformation on the LotArea variable, and replacing X1stFlrSF, X2ndFlrSF, and BedroomAbvGr with GrLivArea and Fireplaces:
# AIC 34891
model2 <- glm(SalePrice ~ MSSubClass + log(LotArea) + OverallQual + OverallCond + YearBuilt + GrLivArea + GarageCars + Fireplaces, data=train_int)
summary(model2)
##
## Call:
## glm(formula = SalePrice ~ MSSubClass + log(LotArea) + OverallQual +
## OverallCond + YearBuilt + GrLivArea + GarageCars + Fireplaces,
## data = train_int)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -380822 -19759 -1797 14676 295224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1246905.797 95127.686 -13.108 < 2e-16 ***
## MSSubClass -126.766 27.534 -4.604 0.00000450588954 ***
## log(LotArea) 15648.989 2509.673 6.235 0.00000000058944 ***
## OverallQual 21537.900 1146.236 18.790 < 2e-16 ***
## OverallCond 5541.537 968.840 5.720 0.00000001293403 ***
## YearBuilt 520.955 46.726 11.149 < 2e-16 ***
## GrLivArea 50.321 2.748 18.310 < 2e-16 ***
## GarageCars 12490.180 1797.504 6.949 0.00000000000555 ***
## Fireplaces 8041.879 1768.696 4.547 0.00000590023871 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1389925390)
##
## Null deviance: 9207911334610 on 1459 degrees of freedom
## Residual deviance: 2016781741307 on 1451 degrees of freedom
## AIC: 34891
##
## Number of Fisher Scoring iterations: 2
Lastly, a model using only a few predictors is built:
# AIC 35003
model3 <- glm(log(SalePrice) ~ log(LotArea) + OverallQual + YearBuilt + GrLivArea, data=train_int)
summary(model3)
##
## Call:
## glm(formula = log(SalePrice) ~ log(LotArea) + OverallQual + YearBuilt +
## GrLivArea, data = train_int)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.92463 -0.07710 0.01250 0.09524 0.54073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.64993531 0.36583013 7.244 7.06e-13 ***
## log(LotArea) 0.14942513 0.00952403 15.689 < 2e-16 ***
## OverallQual 0.13038058 0.00496359 26.267 < 2e-16 ***
## YearBuilt 0.00348343 0.00018658 18.670 < 2e-16 ***
## GrLivArea 0.00023143 0.00001167 19.829 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.029731)
##
## Null deviance: 232.801 on 1459 degrees of freedom
## Residual deviance: 43.259 on 1455 degrees of freedom
## AIC: -982.43
##
## Number of Fisher Scoring iterations: 2
Looking at the coefficients for the predictor variables, we can see they all have positive estimates, though all are very small values. This makes sense, since we can generally assume that the lot size, overall quality, year built, and square footage of the home each have a positive relationship with the price (i.e., large, well-built, new houses on lots of land cost more).
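Before using model3 for prediction, a quick residual check is a reasonable sanity step (a sketch using base graphics, not part of the original write-up):
#Residual diagnostics for model3 (log-scale response)
par(mfrow = c(1, 2))
hist(residuals(model3), breaks = 30, main = "Residuals of model3", xlab = "Residual")
qqnorm(residuals(model3)); qqline(residuals(model3))
par(mfrow = c(1, 1))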
The biggest difference for this model is transforming the response variable, since it is also skewed. Because of its simplicity, we'll use this third model for predicting the Sale Price of the properties in the Kaggle dataset. Using R's predict function, we'll feed it the third reduced model and the test dataset, and store the results in an object.
predict_price <- predict(model3, test, type="response")
Taking this resulting vector, we'll bind it to the Id column of the test data, then create a .csv from the data.
results <- data.frame(test$Id, exp(predict_price)) # use exp to transform the log of the response variable back
colnames(results) <- c("Id", "SalePrice")
head(results)
## Id SalePrice
## 1 1461 125357.1
## 2 1462 161087.0
## 3 1463 172810.9
## 4 1464 187072.4
## 5 1465 198994.6
## 6 1466 186086.3
# Write to .csv for submission to Kaggle
#write.csv(results, file = "predicted_prices_sergio.csv", row.names = FALSE) # not for rendering R Markdown
My Kaggle user name is Sortega78, and the resulting score on Kaggle.com from this model is 0.16805.
knitr::include_graphics("C:\\Users\\sergioor\\Desktop\\Old PC\\CUNY\\DATA605\\DATA605_Final-master\\kagglesergio.png")