Hari Krishnan S3797479
Last updated: 31 May, 2019
RPubs (see here)
RPubs link: http://rpubs.com/harisr/SimpleLinearRegressionPredictHousePrices
y = α + βx + ϵ
where y is the dependent variable,
α is the constant (intercept),
β is the slope,
x is the predictor, and
ϵ is the random error (residual).
## Import the data
# Read the house data from a CSV file in the working directory.
House_Data <- read_csv("kc_house_data.csv")

# Preview the first 10 rows.
head(House_Data, 10)

# Draw a boxplot of price. boxplot() also returns its statistics; the `out`
# component holds the values plotted beyond the whiskers, which are treated
# as outliers below.
x <- boxplot(House_Data$price)
outliers <- x$out
# Remove rows containing the outliers.
# A logical mask is used rather than `-which(...)`: when no price matches,
# `which()` returns integer(0) and negative indexing with it would select
# zero rows (dropping the whole data set) instead of keeping everything.
House_Data <- House_Data[!(House_Data$price %in% outliers), ]

# Descriptive statistics for price after outlier removal.
table1 <- House_Data %>%
  summarise(
    Min = min(price, na.rm = TRUE),
    Q1 = quantile(price, probs = .25, na.rm = TRUE),
    Median = median(price, na.rm = TRUE),
    Q3 = quantile(price, probs = .75, na.rm = TRUE),
    Max = max(price, na.rm = TRUE),
    Mean = mean(price, na.rm = TRUE),
    SD = sd(price, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(price))
  )
knitr::kable(table1)
| Min | Q1 | Median | Q3 | Max | Mean | SD | n | Missing |
|---|---|---|---|---|---|---|---|---|
| 75000 | 315000 | 437500 | 6e+05 | 1127500 | 476984.6 | 208371.3 | 20467 | 0 |
# Descriptive statistics for sqft_living, rendered as a table.
# The result is assigned with `<-` on the left (instead of a trailing `->`)
# so the target name is visible at the start of the pipeline.
table1 <- House_Data %>%
  summarise(
    Min = min(sqft_living, na.rm = TRUE),
    Q1 = quantile(sqft_living, probs = .25, na.rm = TRUE),
    Median = median(sqft_living, na.rm = TRUE),
    Q3 = quantile(sqft_living, probs = .75, na.rm = TRUE),
    Max = max(sqft_living, na.rm = TRUE),
    Mean = mean(sqft_living, na.rm = TRUE),
    SD = sd(sqft_living, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(sqft_living))
  )
knitr::kable(table1)
| Min | Q1 | Median | Q3 | Max | Mean | SD | n | Missing |
|---|---|---|---|---|---|---|---|---|
| 290 | 1400 | 1860 | 2431 | 7480 | 1975.558 | 774.8335 | 20467 | 0 |
# Scatter plot of price against living area on the original scale.
plot(price ~ sqft_living, data = House_Data)

# 2 x 2 grid of histograms comparing each variable with its log transform.
# par() returns the previous settings, so the layout is restored afterwards
# from the saved value rather than hard-coding c(1, 1).
old_par <- par(mfrow = c(2, 2))
House_Data$price %>% hist(main = "House Price")
log(House_Data$price) %>% hist(main = "log(House Price)")
House_Data$sqft_living %>% hist(main = "sqft living Area")
log(House_Data$sqft_living) %>% hist(main = "log(sqft living Area)")
par(old_par)

# Scatter plot on the log-log scale used by the model.
plot(log(price) ~ log(sqft_living), data = House_Data)

# Simple linear regression of log(price) on log(sqft_living).
model1 <- lm(log(price) ~ log(sqft_living), data = House_Data)
model1 %>% summary()
##
## Call:
## lm(formula = log(price) ~ log(sqft_living), data = House_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17058 -0.27537 0.03114 0.26054 1.10186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.876884 0.046980 167.7 <2e-16 ***
## log(sqft_living) 0.679216 0.006245 108.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3551 on 20465 degrees of freedom
## Multiple R-squared: 0.3663, Adjusted R-squared: 0.3662
## F-statistic: 1.183e+04 on 1 and 20465 DF, p-value: < 2.2e-16
# Confidence intervals for the coefficients of model1 (default level).
confint(model1)
## 2.5 % 97.5 %
## (Intercept) 7.7847985 7.968969
## log(sqft_living) 0.6669743 0.691457
H0: α = 0 vs. HA: α ≠ 0
H0: β = 0 vs. HA: β ≠ 0
The line of best fit: log(price) = 7.88 + 0.68 × log(sqft_living)
# Diagnostic plots for model1; `which` selects the individual plot.lm panel.
model1 %>% plot(which = 1)  # residuals vs fitted
model1 %>% plot(which = 2)  # normal Q-Q
model1 %>% plot(which = 3)  # scale-location
model1 %>% plot(which = 5)  # residuals vs leverage

# Confidence interval for the correlation between the two log variables.
library(psychometric)
r <- cor(log(House_Data$price), log(House_Data$sqft_living))
# The sample size is taken from the data rather than hard-coded, so it stays
# correct if the outlier filter above changes the number of rows.
CIr(r = r, n = nrow(House_Data), level = .95)
## [1] 0.5964446 0.6138103