Email          : albert.prayogo99@gmail.com
RPubs         : https://rpubs.com/albert23899
Jurusan      : Statistika
Address     : ARA Center, Matana University Tower
             Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.
Impor paket yang diperlukan
library(MASS)
library(dplyr)##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(lubridate)##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(splines)
library(mgcv)## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
(WD<-getwd())## [1] "F:/Matana/Semester 2/Komputasi Statistika/Tugas 4"
if(!is.null(WD))setwd(WD)
rlung<-read.csv(file.path(WD,'data','LungDisease.csv'))
rhouse<-read.csv(file.path(WD,'data','house_sales.csv'),sep='\t')
head(rlung)## PEFR Exposure
## 1 390 0
## 2 410 0
## 3 430 0
## 4 460 0
## 5 420 1
## 6 280 2
head(rhouse)## DocumentDate SalePrice PropertyID PropertyType ym zhvi_px zhvi_idx
## 1 2014-09-16 280000 1000102 Multiplex 2014-09-01 405100 0.9308364
## 2 2006-06-16 1000000 1200013 Single Family 2006-06-01 404400 0.9292279
## 3 2007-01-29 745000 1200019 Single Family 2007-01-01 425600 0.9779412
## 4 2008-02-25 425000 2800016 Single Family 2008-02-01 418400 0.9613971
## 5 2013-03-29 240000 2800024 Single Family 2013-03-01 351600 0.8079044
## 6 2009-03-30 349900 3600090 Townhouse 2009-03-01 369800 0.8497243
## AdjSalePrice NbrLivingUnits SqFtLot SqFtTotLiving SqFtFinBasement Bathrooms
## 1 300805 2 9373 2400 0 3.00
## 2 1076162 1 20156 3764 1452 3.75
## 3 761805 1 26036 2060 900 1.75
## 4 442065 1 8618 3200 1640 3.75
## 5 297065 1 8620 1720 0 1.75
## 6 411781 1 1012 930 0 1.50
## Bedrooms BldgGrade YrBuilt YrRenovated TrafficNoise LandVal ImpsVal ZipCode
## 1 6 7 1991 0 0 70000 229000 98002
## 2 4 10 2005 0 0 203000 590000 98166
## 3 4 8 1947 0 0 183000 275000 98166
## 4 5 7 1966 0 0 104000 229000 98168
## 5 4 7 1948 0 0 104000 205000 98168
## 6 2 8 2008 0 0 170000 207000 98144
## NewConstruction
## 1 FALSE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 TRUE
#Regresi Linear Sederhana Korelasi
plot(rlung$Exposure, rlung$PEFR, xlab="Exposure", ylab="PEFR") Persamaan Regresi
rmodel<-lm(PEFR~Exposure,data=rlung)
summary(rmodel)##
## Call:
## lm(formula = PEFR ~ Exposure, data = rlung)
##
## Residuals:
## Min 1Q Median 3Q Max
## -297.845 -58.783 -1.214 61.024 209.109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 424.583 20.796 20.417 < 2e-16 ***
## Exposure -4.185 1.325 -3.158 0.00201 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101.4 on 120 degrees of freedom
## Multiple R-squared: 0.07674, Adjusted R-squared: 0.06905
## F-statistic: 9.974 on 1 and 120 DF, p-value: 0.002008
Visualisasi Intersepsi Regresi
plot(rlung$Exposure, rlung$PEFR, xlab="Exposure", ylab="PEFR", ylim=c(300,450), type="n", xaxs="i")
abline(a=rmodel$coefficients[1], b=rmodel$coefficients[2], col="blue", lwd=2)
text(x=.3, y=rmodel$coefficients[1], labels=expression("b"[0]), adj=0, cex=1.5)
x<- c(7.5,17.5)
y<- predict(rmodel, newdata=data.frame(Exposure=x))
segments(x[1],y[2], x[2], y[2],col="red",lwd=2,lty=2)
segments(x[1],y[1], x[1], y[2],col="red",lwd=2,lty=2)
text(x[1],mean(y), labels=expression(Delta~Y),pos=2,cex=1.5)
text(mean(x),y[2], labels=expression(Delta~X),pos=1,cex=1.5)
text(mean(x),400, labels=expression(b[1]==frac(Delta ~ Y,Delta~ X)),cex=1.5) Fit Model Residual & Plot
fitted<-predict(rmodel)
resid<-residuals(rmodel)
fit1<-rlung %>%
mutate(Fitted=fitted,
positive=PEFR>Fitted)%>%
group_by(Exposure, positive)%>%
summarize(PEFR_max = max(PEFR),
PEFR_min = min(PEFR),
Fitted= first(Fitted))%>%
ungroup()%>%
mutate(PEFR= ifelse(positive,PEFR_max,PEFR_min))%>%
arrange(Exposure)## `summarise()` has grouped output by 'Exposure'. You can override using the `.groups` argument.
plot(rlung$Exposure, rlung$PEFR, xlab="Exposure", ylab="PEFR")
abline(a=rmodel$coefficients[1],b=rmodel$coefficients[2], col="blue", lwd=2)
segments(fit1$Exposure, fit1$PEFR, fit1$Fitted, col="red",lty=3) Diagnosis Akurasi Normalitas
library(Lahman)
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x nlme::collapse() masks dplyr::collapse()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(skimr)
library(inspectdf)
library(janitor)##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggrepel)
library(qqplotr)##
## Attaching package: 'qqplotr'
## The following objects are masked from 'package:ggplot2':
##
## stat_qq_line, StatQqLine
ggNormQQPlot <- rlung %>%
# name the 'sample' the outcome variable
ggplot(mapping= aes(sample=PEFR))+
# add the stat_qq_band
qqplotr::stat_qq_band(
bandType = "pointwise",
mapping= aes(fill="Normal"),alpha= 0.5,
show.legend = FALSE
)+
#add the lines
qqplotr::stat_qq_line()+
#add the points
qqplotr::stat_qq_point()+
#add labs
ggplot2::labs(
x= "Kuantil Teoretis",
y= "Residual Sampel",
title ="Q-Q plot Kuantil Teoretis Regresinya"
)
ggNormQQPlot Verifikasi Normalitas
Augmt_r<-broom::augment_columns(rmodel, data= rlung)
ggHistNormResid<- Augmt_r %>%
ggplot2::ggplot(aes(x =.resid)) +
ggplot2::geom_histogram(aes(y=..density..),
colour="darkred",
fill="firebrick",
alpha=0.3,
bins=30
) +
ggplot2::stat_function(
fun=dnorm,
args =list(
mean=mean(Augmt_r$.resid,na.rm =TRUE),
sd= sd(Augmt_r$.resid,na.rm=TRUE)
),
color="darkblue",
size=1
) +
ggplot2::labs(
x= "Residual",
y= "Densitas",
title="Residual vs. Distribusi Normal",
subtitle= "Regresi Linear"
)
ggHistNormResid Linearitas
# Mendapatkan Outlier untuk data di NonNormResidsRunWins
NonNormResid<-Augmt_r %>%
dplyr::arrange(dplyr::desc(abs(.resid)))%>%
head(5)ggResidVsFit <- Augmt_r %>%
ggplot2::ggplot(aes(
x=.fitted,
y=.resid
)) +
#add the points
ggplot2::geom_point(show.legend= FALSE)+
# add the line for the points
ggplot2::stat_smooth(
method="loess",
color="darkblue",
show.legend = FALSE
) +
#add the line for the zero intercepts
ggplot2::geom_hline(
#add y intercept
yintercept = 0,
#add color
color="darkred",
#add line type
linetype="dashed"
) +
#add points for the outliers
ggplot2::geom_point(
data= NonNormResid,
aes(
color="darkred",
size=.fitted
),
show.legend=FALSE
)+
#add text labels
ggrepel::geom_text_repel(
data=NonNormResid,
color="darkblue",
aes(label=base::paste(
NonNormResid$Exposure,
NonNormResid$PEFR
)),
show.legend = FALSE
)+
ggplot2::labs(
x="Fitted values",
y="Residuals",
title="Residual Vs Fitted Using Baseball Data"
)
ggResidVsFit## Warning: Use of `NonNormResid$Exposure` is discouraged. Use `Exposure` instead.
## Warning: Use of `NonNormResid$PEFR` is discouraged. Use `PEFR` instead.
## `geom_smooth()` using formula 'y ~ x'
Homogenitas Varians
ggScaleVSLoc<-Augmt_r %>%
#here we plot the fitted and we get the square root of the absolute value
#of the std.resic
ggplot2::ggplot(aes(
x=.fitted,
y=sqrt(abs(.std.resid))
)) +
#add the points
ggplot2::geom_point(na.rm = TRUE)+
#add stat smooth
stat_smooth(
method="loess",
color="darkred",
na.rm = TRUE
)+
#add the labs
ggplot2::labs(
x="Nilai Penyesuaian (Fitted Values)",
y=expression(sqrt("|Standardized residuals|")),
title="Skala Residual"
)
ggScaleVSLoc## `geom_smooth()` using formula 'y ~ x'
Residual Vs Leverage
Cooks<-Augmt_r %>%
dplyr::filter(.cooksd>=1)
Cooks## # A tibble: 0 x 9
## # ... with 9 variables: PEFR <int>, Exposure <int>, .fitted <dbl>,
## # .se.fit <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
Top5Cooks<-Augmt_r%>%
dplyr::mutate(index_cooksd=seq_along(.cooksd))%>%
dplyr::arrange(desc(.cooksd))%>%
utils::head(5)
Top5Cooks %>% utils::head()## # A tibble: 5 x 10
## PEFR Exposure .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 110 4 408. 16.2 -298. 0.0255 98.0 0.116 -2.97
## 2 610 3 412. 17.3 198. 0.0291 100. 0.0589 1.98
## 3 590 3 412. 17.3 178. 0.0291 101. 0.0476 1.78
## 4 200 6 399. 14.1 -199. 0.0193 100. 0.0389 -1.99
## 5 280 2 416. 18.5 -136. 0.0331 101. 0.0319 -1.37
## # ... with 1 more variable: index_cooksd <int>
AugNormCook<-Augmt_r %>%
dplyr::mutate(index_cooksd=seq_along(.cooksd))
#Plot The New Variable
ggCooksDistance<-AugNormCook %>%
ggplot2::ggplot(
data=.,
#this goes on the x axis
mapping=aes(
x=index_cooksd,
#Cook's D go on the y
y=.cooksd,
#the minimum always 0
ymin=0,
#the max is the max value for .cooksd
ymax=.cooksd
)
) +
#add the points with a size 0.8
ggplot2::geom_point(
size=1.7,
alpha=4/5,
) +
# and the linegrange from 0 to y, to the point
# on the y axis
ggplot2::geom_linerange(
size=0.3,
alpha=2/3
) +
#These are the labels for outliers
ggrepel::geom_label_repel(
data=Top5Cooks,
aes(
label = Top5Cooks[[".cooksd"]],
#move these over a tad
nudge_x=3,
#color this with something nice
color="darkred"
),
show.legend = FALSE
) +
#set the ylim (ylimits) for 0 and the max value for .cooksd
ggplot2::ylim(0,
max(Augmt_r$.cooksd,na.rm=TRUE))+
#add the labs for the last
ggplot2::labs(
x="Nomor Pengamatan",
y="Jarak Cook",
title="Regresi Linear"
)## Warning: Ignoring unknown aesthetics: nudge_x
ggCooksDistance## Warning: Use of `Top5Cooks[[".cooksd"]]` is discouraged. Use
## `.data[[".cooksd"]]` instead.
Top5NonNormResid<-Augmt_r %>%
dplyr::mutate(index_cooksd=seq_along(.cooksd))%>%
dplyr::arrange(dplyr::desc(abs(.resid))) %>%
head(5)
#add the residuals
ggCooksDistance +
ggrepel::geom_label_repel(
data=Top5NonNormResid,
aes(label=base::paste(
Top5NonNormResid$Exposure,
Top5NonNormResid$PEFR
)),
fill="darkred",
nudge_x = 5,
segment.color="darkred",
color="ivory",
#remove the legend
show.legend = FALSE
)## Warning: Use of `Top5Cooks[[".cooksd"]]` is discouraged. Use
## `.data[[".cooksd"]]` instead.
## Warning: Use of `Top5NonNormResid$Exposure` is discouraged. Use `Exposure`
## instead.
## Warning: Use of `Top5NonNormResid$PEFR` is discouraged. Use `PEFR` instead.
ggResidLevPlot<-Augmt_r %>%
ggplot2::ggplot(
data=.,
aes(
x=.hat,
y=.std.resid
)
) +
ggplot2::geom_point()+
ggplot2::stat_smooth(
method="loess",
color="darkblue",
na.rm=TRUE
)+
ggrepel::geom_label_repel(
data=Top5Cooks,
mapping=aes(
x=Top5Cooks$.hat,
y=Top5Cooks$.std.resid,
label=base::paste(
Top5NonNormResid$Exposure,
Top5NonNormResid$PEFR
)
),
label.size=0.15
) +
ggplot2::geom_point(
data=Top5Cooks,
mapping=aes(
x=Top5Cooks$.hat,
y=Top5Cooks$.std.resid
),
show.legend=FALSE
)+
ggplot2::labs(
x="Leverage",
y="Standardized Residuals",
title="Residual Vs Leverage Regresi Linear"
) +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position = "bottom")
ggResidLevPlot## Warning: Use of `Top5Cooks$.hat` is discouraged. Use `.hat` instead.
## Warning: Use of `Top5Cooks$.std.resid` is discouraged. Use `.std.resid` instead.
## Warning: Use of `Top5Cooks$.hat` is discouraged. Use `.hat` instead.
## Warning: Use of `Top5Cooks$.std.resid` is discouraged. Use `.std.resid` instead.
## `geom_smooth()` using formula 'y ~ x'
library(lindia)
lindia::gg_diagnose(rmodel)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#Memprediksi Model Linear Split Data
#setting seed to reproduce results of random sampling
set.seed(100) #row indices for training data
trainingRowIndex<-sample(1:nrow(rlung), 0.8*nrow(rlung))
trainingData<-rlung[trainingRowIndex,] #training data
testdata<-rlung[-trainingRowIndex,] #test dataSesuaikan model data pada pelatihan dan prediksi
lmMod<-lm(PEFR ~ Exposure, data= trainingData) #build the model
Pred<-predict(lmMod,testdata) #Predict DistanceTinjauan tindakan diagnostik
summary(lmMod)##
## Call:
## lm(formula = PEFR ~ Exposure, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -289.708 -54.587 8.064 59.088 209.088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 414.407 23.911 17.331 <2e-16 ***
## Exposure -3.675 1.522 -2.414 0.0177 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 100.2 on 95 degrees of freedom
## Multiple R-squared: 0.05782, Adjusted R-squared: 0.0479
## F-statistic: 5.83 on 1 and 95 DF, p-value: 0.01767
Hitung Akurasi Prediksi dan Tingkat Kesalahan
actuals_preds<-data.frame(cbind(actuals=testdata$PEFR, predicteds=Pred))
correlation_accuracy<-cor(actuals_preds)
head(actuals_preds)## actuals predicteds
## 1 390 414.4073
## 6 280 407.0577
## 8 520 407.0577
## 10 590 403.3830
## 13 360 403.3830
## 17 400 399.7082
#Min-Max Accuracy Calculation
min_max_accuracy<-mean(apply(actuals_preds,1,min)/apply(actuals_preds,1,max))
#MAPE Calculation
mape<-mean(abs((actuals_preds$predicteds-actuals_preds$actuals))/actuals_preds$actuals)Menghitung semua metrik kesalahan
library(DMwR)## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
DMwR::regr.eval(actuals_preds$actuals,actuals_preds$predicteds)## mae mse rmse mape
## 9.010079e+01 1.132387e+04 1.064137e+02 2.710465e-01
#Regresi Linear Majemuk Menilai Model
print(head(rhouse[,c('AdjSalePrice','SqFtTotLiving','SqFtLot','Bathrooms','Bedrooms','BldgGrade')]))## AdjSalePrice SqFtTotLiving SqFtLot Bathrooms Bedrooms BldgGrade
## 1 300805 2400 9373 3.00 6 7
## 2 1076162 3764 20156 3.75 4 10
## 3 761805 2060 26036 1.75 4 8
## 4 442065 3200 8618 3.75 5 7
## 5 297065 1720 8620 1.75 4 7
## 6 411781 930 1012 1.50 2 8
house_lm<-lm(AdjSalePrice~SqFtTotLiving+SqFtTotLiving+Bathrooms+Bedrooms+BldgGrade,data=rhouse,na.action=na.omit)
summary(house_lm)##
## Call:
## lm(formula = AdjSalePrice ~ SqFtTotLiving + SqFtTotLiving + Bathrooms +
## Bedrooms + BldgGrade, data = rhouse, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1197924 -118611 -21138 87577 9473072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.224e+05 1.564e+04 -33.391 < 2e-16 ***
## SqFtTotLiving 2.282e+02 3.851e+00 59.267 < 2e-16 ***
## Bathrooms -1.925e+04 3.620e+03 -5.317 1.07e-07 ***
## Bedrooms -4.764e+04 2.486e+03 -19.161 < 2e-16 ***
## BldgGrade 1.061e+05 2.396e+03 44.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 261300 on 22682 degrees of freedom
## Multiple R-squared: 0.5406, Adjusted R-squared: 0.5405
## F-statistic: 6672 on 4 and 22682 DF, p-value: < 2.2e-16
house_full<-lm(AdjSalePrice~SqFtTotLiving+SqFtTotLiving+Bathrooms+Bedrooms+BldgGrade+PropertyType+NbrLivingUnits+SqFtFinBasement+YrBuilt+YrRenovated+NewConstruction,data=rhouse,na.action=na.omit)
step_lm<-stepAIC(house_full,direction = "both")## Start: AIC=563145.2
## AdjSalePrice ~ SqFtTotLiving + SqFtTotLiving + Bathrooms + Bedrooms +
## BldgGrade + PropertyType + NbrLivingUnits + SqFtFinBasement +
## YrBuilt + YrRenovated + NewConstruction
##
## Df Sum of Sq RSS AIC
## - NbrLivingUnits 1 7.1352e+09 1.3663e+15 563143
## - NewConstruction 1 1.8117e+10 1.3663e+15 563144
## - YrRenovated 1 2.4309e+10 1.3663e+15 563144
## <none> 1.3663e+15 563145
## - SqFtFinBasement 1 1.3058e+11 1.3664e+15 563145
## - PropertyType 2 4.3784e+12 1.3707e+15 563214
## - Bathrooms 1 7.5665e+12 1.3738e+15 563269
## - Bedrooms 1 2.8532e+13 1.3948e+15 563612
## - YrBuilt 1 1.2903e+14 1.4953e+15 565190
## - SqFtTotLiving 1 1.3653e+14 1.5028e+15 565304
## - BldgGrade 1 1.9039e+14 1.5567e+15 566103
##
## Step: AIC=563143.3
## AdjSalePrice ~ SqFtTotLiving + Bathrooms + Bedrooms + BldgGrade +
## PropertyType + SqFtFinBasement + YrBuilt + YrRenovated +
## NewConstruction
##
## Df Sum of Sq RSS AIC
## - NewConstruction 1 1.8438e+10 1.3663e+15 563142
## - YrRenovated 1 2.4887e+10 1.3663e+15 563142
## <none> 1.3663e+15 563143
## - SqFtFinBasement 1 1.2848e+11 1.3664e+15 563143
## + NbrLivingUnits 1 7.1352e+09 1.3663e+15 563145
## - PropertyType 2 4.3871e+12 1.3707e+15 563212
## - Bathrooms 1 7.6846e+12 1.3740e+15 563269
## - Bedrooms 1 2.8590e+13 1.3949e+15 563611
## - YrBuilt 1 1.3009e+14 1.4964e+15 565205
## - SqFtTotLiving 1 1.3680e+14 1.5031e+15 565306
## - BldgGrade 1 1.9166e+14 1.5579e+15 566120
##
## Step: AIC=563141.6
## AdjSalePrice ~ SqFtTotLiving + Bathrooms + Bedrooms + BldgGrade +
## PropertyType + SqFtFinBasement + YrBuilt + YrRenovated
##
## Df Sum of Sq RSS AIC
## - YrRenovated 1 2.5199e+10 1.3663e+15 563140
## <none> 1.3663e+15 563142
## - SqFtFinBasement 1 1.3668e+11 1.3664e+15 563142
## + NewConstruction 1 1.8438e+10 1.3663e+15 563143
## + NbrLivingUnits 1 7.4554e+09 1.3663e+15 563144
## - PropertyType 2 4.4634e+12 1.3708e+15 563212
## - Bathrooms 1 7.6799e+12 1.3740e+15 563267
## - Bedrooms 1 2.8572e+13 1.3949e+15 563609
## - YrBuilt 1 1.3748e+14 1.5038e+15 565315
## - SqFtTotLiving 1 1.3751e+14 1.5038e+15 565315
## - BldgGrade 1 1.9234e+14 1.5586e+15 566128
##
## Step: AIC=563140.1
## AdjSalePrice ~ SqFtTotLiving + Bathrooms + Bedrooms + BldgGrade +
## PropertyType + SqFtFinBasement + YrBuilt
##
## Df Sum of Sq RSS AIC
## <none> 1.3663e+15 563140
## - SqFtFinBasement 1 1.4116e+11 1.3665e+15 563140
## + YrRenovated 1 2.5199e+10 1.3663e+15 563142
## + NewConstruction 1 1.8750e+10 1.3663e+15 563142
## + NbrLivingUnits 1 8.0521e+09 1.3663e+15 563142
## - PropertyType 2 4.4415e+12 1.3708e+15 563210
## - Bathrooms 1 7.7109e+12 1.3740e+15 563266
## - Bedrooms 1 2.8553e+13 1.3949e+15 563607
## - SqFtTotLiving 1 1.3748e+14 1.5038e+15 565313
## - YrBuilt 1 1.5080e+14 1.5171e+15 565513
## - BldgGrade 1 1.9234e+14 1.5587e+15 566126
step_lm##
## Call:
## lm(formula = AdjSalePrice ~ SqFtTotLiving + Bathrooms + Bedrooms +
## BldgGrade + PropertyType + SqFtFinBasement + YrBuilt, data = rhouse,
## na.action = na.omit)
##
## Coefficients:
## (Intercept) SqFtTotLiving
## 6.179e+06 1.993e+02
## Bathrooms Bedrooms
## 4.240e+04 -5.195e+04
## BldgGrade PropertyTypeSingle Family
## 1.372e+05 2.291e+04
## PropertyTypeTownhouse SqFtFinBasement
## 8.448e+04 7.047e+00
## YrBuilt
## -3.565e+03
rhouse$Year=year(rhouse$DocumentDate)
rhouse$Weight=rhouse$Year-2005
house_wt<-lm(AdjSalePrice~SqFtTotLiving+SqFtTotLiving+Bathrooms+Bedrooms+BldgGrade,
data=rhouse,weight=Weight,na.action = na.omit)
round(cbind(house_lm=house_lm$coefficients,
house_wt=house_wt$coefficients),digits=3)## house_lm house_wt
## (Intercept) -522362.016 -587456.013
## SqFtTotLiving 228.229 241.498
## Bathrooms -19245.292 -25106.792
## Bedrooms -47639.407 -52703.656
## BldgGrade 106128.136 115480.402
head(rhouse[,'PropertyType'])## [1] "Multiplex" "Single Family" "Single Family" "Single Family"
## [5] "Single Family" "Townhouse"
#Representasi variabel dan dummy
prop_type_dummies<-model.matrix(~PropertyType-1,data = rhouse)
head(prop_type_dummies)## PropertyTypeMultiplex PropertyTypeSingle Family PropertyTypeTownhouse
## 1 1 0 0
## 2 0 1 0
## 3 0 1 0
## 4 0 1 0
## 5 0 1 0
## 6 0 0 1
house_dm<-lm(AdjSalePrice~SqFtTotLiving+SqFtTotLiving+Bathrooms+Bedrooms+BldgGrade+PropertyType,data=rhouse)
summary(house_dm)##
## Call:
## lm(formula = AdjSalePrice ~ SqFtTotLiving + SqFtTotLiving + Bathrooms +
## Bedrooms + BldgGrade + PropertyType, data = rhouse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1193053 -118582 -20434 86455 9477569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.471e+05 2.236e+04 -19.994 < 2e-16 ***
## SqFtTotLiving 2.228e+02 4.096e+00 54.390 < 2e-16 ***
## Bathrooms -1.583e+04 3.808e+03 -4.157 3.24e-05 ***
## Bedrooms -5.071e+04 2.533e+03 -20.018 < 2e-16 ***
## BldgGrade 1.094e+05 2.458e+03 44.521 < 2e-16 ***
## PropertyTypeSingle Family -8.494e+04 1.665e+04 -5.102 3.38e-07 ***
## PropertyTypeTownhouse -1.149e+05 1.816e+04 -6.327 2.54e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 261000 on 22680 degrees of freedom
## Multiple R-squared: 0.5414, Adjusted R-squared: 0.5413
## F-statistic: 4463 on 6 and 22680 DF, p-value: < 2.2e-16
house_eu<-lm(AdjSalePrice~SqFtTotLiving*Weight+SqFtLot+ Bathrooms+Bedrooms+BldgGrade+PropertyType, data = rhouse, na.action=na.omit)
summary(house_eu)##
## Call:
## lm(formula = AdjSalePrice ~ SqFtTotLiving * Weight + SqFtLot +
## Bathrooms + Bedrooms + BldgGrade + PropertyType, data = rhouse,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1139360 -116547 -20236 85523 9582582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.951e+05 2.296e+04 -17.205 < 2e-16 ***
## SqFtTotLiving 1.966e+02 4.787e+00 41.063 < 2e-16 ***
## Weight -1.626e+04 1.886e+03 -8.621 < 2e-16 ***
## SqFtLot -1.058e-01 6.112e-02 -1.731 0.0834 .
## Bathrooms -1.568e+04 3.799e+03 -4.128 3.67e-05 ***
## Bedrooms -5.097e+04 2.531e+03 -20.141 < 2e-16 ***
## BldgGrade 1.089e+05 2.451e+03 44.431 < 2e-16 ***
## PropertyTypeSingle Family -8.561e+04 1.660e+04 -5.157 2.52e-07 ***
## PropertyTypeTownhouse -1.164e+05 1.811e+04 -6.426 1.34e-10 ***
## SqFtTotLiving:Weight 9.226e+00 8.331e-01 11.074 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 260200 on 22677 degrees of freedom
## Multiple R-squared: 0.5442, Adjusted R-squared: 0.544
## F-statistic: 3008 on 9 and 22677 DF, p-value: < 2.2e-16
#HeteroSkedastisitas,Non-Normality and Correlated Errors
df<-data.frame(
resid=residuals(house_eu),
pred=predict(house_eu))
graph<-ggplot(df,aes(pred,abs(resid)))+
geom_point()+
geom_smooth()+
scale_x_continuous(labels=function(x)format(x,scientific = FALSE))+
theme_bw()
graph## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Plot Residual Parsial dan Nonlinier
terms<-predict(house_eu,type='terms')
partial_resid<-resid(house_eu)+terms
df<-data.frame(SqFtTotLiving=rhouse[,'SqFtTotLiving'],
Terms=terms[,'SqFtTotLiving'],
PartialResid=partial_resid[,'SqFtTotLiving'])
graph<-ggplot(df,aes(SqFtTotLiving,PartialResid))+
geom_point(shape=1)+scale_shape(solid=FALSE)+
geom_smooth(linetype=2)+
geom_line(aes(SqFtTotLiving,Terms))+
scale_y_continuous(labels=function(x) format(x,scientific = FALSE))
graph## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Regresi Polinomial
lm_poly<-lm(AdjSalePrice~poly(SqFtTotLiving,2)+SqFtLot+BldgGrade+Bathrooms+Bedrooms,
data=rhouse)
terms<-predict(lm_poly,type='terms')
partial_resid<-resid(lm_poly)+terms
summary(lm_poly)##
## Call:
## lm(formula = AdjSalePrice ~ poly(SqFtTotLiving, 2) + SqFtLot +
## BldgGrade + Bathrooms + Bedrooms, data = rhouse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2410341 -105382 -24376 74292 8153691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682e+05 2.027e+04 -13.231 < 2e-16 ***
## poly(SqFtTotLiving, 2)1 2.615e+07 5.067e+05 51.619 < 2e-16 ***
## poly(SqFtTotLiving, 2)2 1.503e+07 2.515e+05 59.752 < 2e-16 ***
## SqFtLot -2.371e-01 5.695e-02 -4.163 3.16e-05 ***
## BldgGrade 1.173e+05 2.235e+03 52.475 < 2e-16 ***
## Bathrooms -2.688e+03 3.382e+03 -0.795 0.427
## Bedrooms -1.751e+04 2.369e+03 -7.391 1.51e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 242800 on 22680 degrees of freedom
## Multiple R-squared: 0.6031, Adjusted R-squared: 0.603
## F-statistic: 5743 on 6 and 22680 DF, p-value: < 2.2e-16
df<-data.frame(SqFtTotLiving=rhouse[,'SqFtTotLiving'],
Terms=terms[,1],
PartialResid=partial_resid[,1])
graph<-ggplot(df,aes(SqFtTotLiving,PartialResid))+
geom_point(shape=1)+scale_shape(solid=FALSE)+
geom_smooth(linetype=2)+
geom_line(aes(SqFtTotLiving,Terms))+
scale_y_continuous(labels=function(x) format(x,scientific = FALSE))
graph## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Regresi Spline
knots<-quantile(rhouse$SqFtTotLiving,p=c(.25,.5,.75))
lm_spline<-lm(AdjSalePrice~bs(SqFtTotLiving,knots=knots,degree=3)+SqFtLot+Bathrooms+Bedrooms+BldgGrade,data=rhouse)
summary(lm_spline)##
## Call:
## lm(formula = AdjSalePrice ~ bs(SqFtTotLiving, knots = knots,
## degree = 3) + SqFtLot + Bathrooms + Bedrooms + BldgGrade,
## data = rhouse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2833721 -105586 -24250 73311 8044949
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2.568e+05 4.825e+04 -5.323
## bs(SqFtTotLiving, knots = knots, degree = 3)1 -1.751e+05 6.419e+04 -2.727
## bs(SqFtTotLiving, knots = knots, degree = 3)2 -1.375e+05 4.400e+04 -3.124
## bs(SqFtTotLiving, knots = knots, degree = 3)3 -6.215e+04 4.941e+04 -1.258
## bs(SqFtTotLiving, knots = knots, degree = 3)4 2.573e+05 5.284e+04 4.870
## bs(SqFtTotLiving, knots = knots, degree = 3)5 2.168e+06 9.869e+04 21.965
## bs(SqFtTotLiving, knots = knots, degree = 3)6 6.194e+06 1.656e+05 37.405
## SqFtLot -2.366e-01 5.688e-02 -4.159
## Bathrooms -8.508e+03 3.463e+03 -2.457
## Bedrooms -1.901e+04 2.389e+03 -7.956
## BldgGrade 1.201e+05 2.262e+03 53.078
## Pr(>|t|)
## (Intercept) 1.03e-07 ***
## bs(SqFtTotLiving, knots = knots, degree = 3)1 0.00640 **
## bs(SqFtTotLiving, knots = knots, degree = 3)2 0.00179 **
## bs(SqFtTotLiving, knots = knots, degree = 3)3 0.20843
## bs(SqFtTotLiving, knots = knots, degree = 3)4 1.12e-06 ***
## bs(SqFtTotLiving, knots = knots, degree = 3)5 < 2e-16 ***
## bs(SqFtTotLiving, knots = knots, degree = 3)6 < 2e-16 ***
## SqFtLot 3.20e-05 ***
## Bathrooms 0.01401 *
## Bedrooms 1.85e-15 ***
## BldgGrade < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 242400 on 22676 degrees of freedom
## Multiple R-squared: 0.6047, Adjusted R-squared: 0.6045
## F-statistic: 3468 on 10 and 22676 DF, p-value: < 2.2e-16
terms1<-predict(lm_spline,type='terms')
partial_resid1<-resid(lm_spline)+terms
df1<-data.frame(SqFtTotLiving=rhouse[,'SqFtTotLiving'],
Terms=terms1[,1],
PartialResid=partial_resid1[,1])
graph<-ggplot(df,aes(SqFtTotLiving,PartialResid))+
geom_point(shape=1)+scale_shape(solid=FALSE)+
geom_smooth(linetype=2)+
geom_line(aes(SqFtTotLiving,Terms))+
scale_y_continuous(labels=function(x) format(x,scientific = FALSE))
graph## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Model Aditif Umum
lm_gam<-gam(AdjSalePrice~s(SqFtTotLiving)+SqFtLot+Bathrooms+Bedrooms+BldgGrade,data = rhouse)
terms<-predict.gam(lm_gam,type='terms')
partial_resid<-resid(lm_gam)+terms
summary(lm_gam)##
## Family: gaussian
## Link function: identity
##
## Formula:
## AdjSalePrice ~ s(SqFtTotLiving) + SqFtLot + Bathrooms + Bedrooms +
## BldgGrade
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.644e+05 2.036e+04 -12.989 < 2e-16 ***
## SqFtLot -2.250e-01 5.676e-02 -3.965 7.36e-05 ***
## Bathrooms -8.535e+03 3.439e+03 -2.482 0.0131 *
## Bedrooms -1.938e+04 2.375e+03 -8.158 3.57e-16 ***
## BldgGrade 1.193e+05 2.251e+03 52.978 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(SqFtTotLiving) 8.98 9 877.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.607 Deviance explained = 60.8%
## GCV = 5.8357e+10 Scale est. = 5.8321e+10 n = 22687
df<-data.frame(SqFtTotLiving=rhouse[,'SqFtTotLiving'],
Terms=terms[,5],
PartialResid=partial_resid[,5])
graph<-ggplot(df,aes(SqFtTotLiving,PartialResid))+
geom_point(shape=1)+scale_shape(solid=FALSE)+
geom_smooth(linetype=2)+
geom_line(aes(SqFtTotLiving,Terms))+
scale_y_continuous(labels=function(x) format(x,scientific = FALSE))
graph## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'