Read Data
#Use read.csv() to read in the data
ames <- read.csv("/Users/KAP/Documents/School/02 - MSBA/2 - Statistics for Business Analytics/AmesHousing2.csv")
Week 1: Summarizing Data and Sampling
#*****SUMMARIZE DATA*****#
#Numeric Variables
#The mean() and median() functions compute the mean and median
mean(ames$SalePrice)
## [1] 180796.1
median(ames$SalePrice)
## [1] 160000
#Means of multiple numeric columns using the colMeans() function
colMeans(ames[sapply(ames, is.numeric)])
## Order Lot.Frontage Lot.Area Overall.Qual Overall.Cond
## 1.465500e+03 NA 1.014792e+04 6.094881e+00 5.563140e+00
## Year.Built Year.Remod.Add Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 1.971356e+03 1.984267e+03 NA NA 4.921840e+02
## X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 1.159558e+03 3.354560e+02 1.499690e+03 1.566553e+00 3.795222e-01
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr TotRms.AbvGrd Fireplaces
## 1.756314e+00 2.854266e+00 1.044369e+00 6.443003e+00 5.993174e-01
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## NA 9.375188e+01 4.753345e+01 2.301160e+01 2.592491e+00
## Screen.Porch Mo.Sold Yr.Sold SalePrice
## 1.600205e+01 6.216041e+00 2.007790e+03 1.807961e+05
#Set na.rm=TRUE to ignore any missing values
colMeans(ames[sapply(ames, is.numeric)], na.rm=TRUE)
## Order Lot.Frontage Lot.Area Overall.Qual Overall.Cond
## 1.465500e+03 6.922459e+01 1.014792e+04 6.094881e+00 5.563140e+00
## Year.Built Year.Remod.Add Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 1.971356e+03 1.984267e+03 5.592625e+02 1.051615e+03 4.921840e+02
## X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 1.159558e+03 3.354560e+02 1.499690e+03 1.566553e+00 3.795222e-01
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr TotRms.AbvGrd Fireplaces
## 1.756314e+00 2.854266e+00 1.044369e+00 6.443003e+00 5.993174e-01
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## 4.728197e+02 9.375188e+01 4.753345e+01 2.301160e+01 2.592491e+00
## Screen.Porch Mo.Sold Yr.Sold SalePrice
## 1.600205e+01 6.216041e+00 2.007790e+03 1.807961e+05
#The mode of a variable is computed using the mfv() function in the modeest R package
mfv(ames$Mo.Sold)
## [1] 6
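#If modeest is unavailable, a minimal sketch of a mode helper (hypothetical
#name my_mode); it returns the most frequent value(s) as character labels
my_mode <- function(x) {
  tab <- table(x)               #frequency of each distinct value
  names(tab)[tab == max(tab)]   #keep the value(s) with the highest count
}
my_mode(ames$Mo.Sold)           #should agree with mfv() above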
#The quantile() function gives the min, max, and the three quartiles
quantile(ames$SalePrice)
## 0% 25% 50% 75% 100%
## 12789 129500 160000 213500 755000
#Compute deciles (seq() generates the probabilities, values between 0 and 1)
quantile(ames$SalePrice, seq(0, 1, 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70%
## 12789.0 105450.0 124000.0 135000.0 146500.0 160000.0 178536.0 199500.0
## 80% 90% 100%
## 230000.0 281241.7 755000.0
#Compute an overall summary of a variable
#If the column is numeric, the mean, median, min, max, and quartiles are returned
#Categorical variables are stored as factors in R
#If the column is a factor, the number of observations in each level is returned
summary(ames$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000
#Categorical Variables (count)
table(ames$Bldg.Type)
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 2425 62 109 101 233
#Counts as proportions of the total
table(ames$Bldg.Type)/nrow(ames)
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 0.82764505 0.02116041 0.03720137 0.03447099 0.07952218
#*****SAMPLING*****#
# GENERATE A RANDOM SAMPLE OF SIZE 100
rnorm(n=100, mean = 15, sd= 6)
## [1] 16.75622780 8.16419192 15.60728599 17.51960715 17.57796063 12.98722622
## [7] 14.16273832 15.97625259 13.29205379 13.96767337 12.90834901 16.68244360
## [13] 4.46487095 3.35807447 23.10103347 18.93839768 10.04724240 26.66551515
## [19] 20.81420622 10.75944430 13.64387453 15.46035213 8.34392092 20.87654151
## [25] 12.46609229 10.87382351 12.44834926 10.92443575 21.46025306 9.94062438
## [31] 14.13387433 13.74577930 23.52771038 16.35024676 12.20843018 0.08748966
## [37] 5.81357682 16.01764646 18.06445810 20.52347297 9.17207574 20.62620500
## [43] 21.31819120 7.08327746 3.41010815 16.91708732 13.72520858 9.08926876
## [49] 14.95325486 21.66039494 12.00784910 8.37056396 16.59159429 10.55510419
## [55] 8.33446953 21.85680913 4.80748742 14.39455793 23.04420476 18.76003843
## [61] 16.70052040 5.56025373 18.68000090 8.43453507 18.73700796 3.84706878
## [67] 16.61942229 15.87927552 9.13172069 6.09697766 19.79429457 20.38876310
## [73] 15.94359014 7.76213003 9.81809185 17.26651516 12.50890153 18.59172966
## [79] 15.85644772 22.86546997 12.92925633 22.53647892 20.24181925 17.27501869
## [85] 16.47940169 18.74982297 15.54325285 19.89758151 17.92858534 7.94706655
## [91] 19.47092910 29.71529770 11.44714126 13.60318841 8.37474176 22.72923315
## [97] 24.08987315 10.84944888 8.71826703 23.65582484
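#rnorm() gives different draws on every run; a sketch using set.seed() to make
#the sample reproducible (the seed value 123 is arbitrary)
set.seed(123)
rnorm(n = 100, mean = 15, sd = 6)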
#There are a number of sampling methods we can use in R
#SIMPLE RANDOM SAMPLE
#Create a vector of 100 random numbers pulled from the normal distribution
nd <- rnorm(100)
#Sample 10 of those numbers at random (simple random sample)
#The sample() function generates a random sample from a given population
sample(nd, 10)
## [1] 0.05662768 2.41552638 -1.22220786 -0.15597747 -1.04228030 0.45661309
## [7] -0.73608084 -1.03843872 0.74064982 0.07551901
#Selecting a random sample from a data frame (Ames data)
#Two-step process: first draw 10 row indices at random
index <- sample(1:nrow(ames), 10)
#Then subset the dataset using this index to get the random sample
ames[index, ]
## Order Lot.Frontage Lot.Area Lot.Shape Utilities Lot.Config Land.Slope
## 827 2217 NA 11275 IR1 AllPub Corner Mod
## 2166 1728 59 15810 IR1 AllPub Inside Gtl
## 2537 2300 69 7599 Reg AllPub Corner Gtl
## 247 533 65 8127 IR1 AllPub Inside Gtl
## 1371 470 51 3635 Reg AllPub Inside Gtl
## 700 1812 NA 11613 IR2 AllPub Corner Gtl
## 2368 2048 59 4484 IR1 AllPub Corner Gtl
## 2614 2487 70 9100 Reg AllPub Inside Gtl
## 1547 753 92 5520 Reg AllPub Corner Gtl
## 2663 2568 85 8500 Reg AllPub Inside Gtl
## Neighborhood Bldg.Type House.Style Overall.Qual Overall.Cond Year.Built
## 827 Crawfor 1Fam 1.5Fin 6 7 1932
## 2166 Gilbert 1Fam 2Story 6 5 2007
## 2537 Mitchel 1Fam 1Story 4 6 1982
## 247 Somerst 1Fam 2Story 7 5 2003
## 1371 Blmngtn TwnhsE 1Story 7 5 2007
## 700 SawyerW 1Fam 2Story 6 5 1993
## 2368 SWISU 2fmCon 1.5Fin 5 6 1942
## 2614 Sawyer Duplex 1Story 5 5 1963
## 1547 OldTown 1Fam 2.5Fin 6 6 1912
## 2663 NAmes 1Fam 1Story 5 3 1961
## Year.Remod.Add Foundation Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 827 1950 CBlock 0 854 854
## 2166 2007 CBlock 768 768 0
## 2537 2006 CBlock 280 845 565
## 247 2003 PConc 402 812 410
## 1371 2007 PConc 398 1386 988
## 700 1997 PConc 384 864 480
## 2368 1979 CBlock 187 672 485
## 2614 1963 CBlock 396 1728 1332
## 1547 1950 CBlock 755 755 0
## 2663 1961 CBlock 635 1235 600
## Central.Air X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 827 Y 1096 895 1991 1 1
## 2166 Y 768 728 1496 3 0
## 2537 Y 845 0 845 1 0
## 247 Y 812 841 1653 2 1
## 1371 Y 1569 0 1569 2 0
## 700 Y 920 900 1820 2 1
## 2368 N 778 504 1282 1 0
## 2614 Y 1728 0 1728 2 0
## 1547 Y 929 929 2229 1 0
## 2663 Y 1235 0 1235 1 0
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd
## 827 1.5 3 1 TA 7
## 2166 3.0 3 1 Gd 7
## 2537 1.0 2 1 TA 4
## 247 2.5 3 1 Gd 6
## 1371 2.0 1 1 Gd 7
## 700 2.5 3 1 Gd 7
## 2368 1.0 2 1 TA 4
## 2614 2.0 6 2 TA 10
## 1547 1.0 5 1 TA 8
## 2663 1.0 2 1 TA 6
## Fireplaces Garage.Type Garage.Area Wood.Deck.SF Open.Porch.SF
## 827 1 Detchd 432 0 0
## 2166 0 Attchd 572 100 0
## 2537 0 Detchd 360 0 0
## 247 0 Attchd 628 0 45
## 1371 1 Attchd 660 143 20
## 700 0 Attchd 492 144 85
## 2368 0 Detchd 240 0 88
## 2614 0 Detchd 504 0 0
## 1547 0 <NA> 0 0 198
## 2663 0 Attchd 480 0 0
## Enclosed.Porch X3Ssn.Porch Screen.Porch Mo.Sold Yr.Sold Sale.Condition
## 827 19 0 0 3 2007 Normal
## 2166 0 0 0 5 2007 Partial
## 2537 0 0 0 6 2007 Normal
## 247 0 0 0 3 2009 Normal
## 1371 0 0 0 5 2009 Normal
## 700 0 0 0 5 2007 Normal
## 2368 0 0 0 7 2007 Normal
## 2614 0 0 0 11 2006 Abnorml
## 1547 30 0 0 7 2009 Abnorml
## 2663 0 0 0 12 2006 Abnorml
## SalePrice
## 827 220000
## 2166 181755
## 2537 129500
## 247 209000
## 1371 175900
## 700 191750
## 2368 108500
## 2614 125000
## 1547 104000
## 2663 98600
#STRATIFIED RANDOM SAMPLE
#In the dplyr package, use the group_by() function for the strata and sample_n() for the number of samples in each group
ames %>%
group_by(Bldg.Type) %>%
sample_n(5)
## # A tibble: 25 x 41
## # Groups: Bldg.Type [5]
## Order Lot.Frontage Lot.Area Lot.Shape Utilities Lot.Config Land.Slope
## <int> <int> <int> <chr> <chr> <chr> <chr>
## 1 319 92 12003 Reg AllPub Corner Gtl
## 2 427 98 12704 Reg AllPub Inside Gtl
## 3 1285 30 5232 IR3 AllPub Inside Gtl
## 4 2866 70 12320 IR1 AllPub Inside Gtl
## 5 343 85 10200 Reg AllPub Inside Gtl
## 6 1415 70 7000 Reg AllPub Inside Gtl
## 7 1516 55 5687 Reg AllPub Inside Gtl
## 8 2871 90 15750 Reg AllPub Corner Gtl
## 9 2283 NA 32463 Reg AllPub Inside Mod
## 10 2524 79 13110 IR1 AllPub Corner Gtl
## # … with 15 more rows, and 34 more variables: Neighborhood <chr>,
## # Bldg.Type <chr>, House.Style <chr>, Overall.Qual <int>, Overall.Cond <int>,
## # Year.Built <int>, Year.Remod.Add <int>, Foundation <chr>,
## # Bsmt.Unf.SF <int>, Total.Bsmt.SF <int>, BaseLivArea <int>,
## # Central.Air <chr>, X1st.Flr.SF <int>, X2nd.Flr.SF <int>, Gr.Liv.Area <int>,
## # Full.Bath <int>, Half.Bath <int>, Bathrooms <dbl>, Bedroom.AbvGr <int>,
## # Kitchen.AbvGr <int>, Kitchen.Qual <chr>, TotRms.AbvGrd <int>,
## # Fireplaces <int>, Garage.Type <chr>, Garage.Area <int>, Wood.Deck.SF <int>,
## # Open.Porch.SF <int>, Enclosed.Porch <int>, X3Ssn.Porch <int>,
## # Screen.Porch <int>, Mo.Sold <int>, Yr.Sold <int>, Sale.Condition <chr>,
## # SalePrice <int>
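#In newer dplyr versions, slice_sample() supersedes sample_n(); a sketch of a
#proportionate stratified sample (1% of each Bldg.Type; the proportion is arbitrary)
ames %>%
group_by(Bldg.Type) %>%
slice_sample(prop = 0.01)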
#SYSTEMATIC SAMPLE
#Write a function that computes the step k to take through the dataset
#N is the total number of rows, n is the sample size, and k = N/n is the step
#sample() picks a random starting row between 1 and k
#seq() then generates every kth row index from that starting point
obtain_sys = function(N,n){
k=ceiling(N/n)
r=sample(1:k, 1)
seq(r, r+k*(n-1), k)
}
sys.sample = ames[obtain_sys(nrow(ames), 10), ]
head(sys.sample)
## Order Lot.Frontage Lot.Area Lot.Shape Utilities Lot.Config Land.Slope
## 121 246 83 11980 Reg AllPub Inside Mod
## 414 1048 24 2308 Reg AllPub FR2 Gtl
## 707 1833 NA 9572 IR1 AllPub Inside Gtl
## 1000 2593 94 22136 Reg AllPub Inside Gtl
## 1293 322 89 13214 IR1 AllPub Inside Gtl
## 1586 803 90 14684 IR1 AllPub CulDSac Gtl
## Neighborhood Bldg.Type House.Style Overall.Qual Overall.Cond Year.Built
## 121 SawyerW 1Fam 1Story 7 5 1987
## 414 NPkVill TwnhsE 2Story 6 6 1975
## 707 NoRidge 1Fam 2Story 8 5 1990
## 1000 OldTown 2fmCon 1.5Fin 5 5 1925
## 1293 Timber 1Fam 1Story 9 5 2008
## 1586 SawyerW 1Fam 1Story 7 7 1990
## Year.Remod.Add Foundation Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 121 1987 CBlock 0 1433 1433
## 414 1975 CBlock 275 855 580
## 707 1990 PConc 971 1453 482
## 1000 1975 CBlock 1153 2171 1018
## 1293 2009 PConc 2002 2002 0
## 1586 1991 CBlock 1496 2158 662
## Central.Air X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 121 Y 1433 0 1433 1 1
## 414 Y 855 601 1456 2 1
## 707 Y 1453 1357 2810 2 1
## 1000 Y 1392 1248 2640 2 1
## 1293 Y 2018 0 2018 2 0
## 1586 Y 2196 0 2196 2 0
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd
## 121 1.5 1 1 Gd 4
## 414 2.5 4 1 TA 7
## 707 2.5 4 1 Gd 9
## 1000 2.5 5 1 TA 10
## 1293 2.0 3 1 Ex 10
## 1586 2.0 3 1 Gd 7
## Fireplaces Garage.Type Garage.Area Wood.Deck.SF Open.Porch.SF
## 121 2 Attchd 528 0 278
## 414 0 Attchd 460 0 0
## 707 1 Attchd 750 500 0
## 1000 1 Attchd 1008 631 48
## 1293 1 Attchd 746 144 76
## 1586 1 Attchd 701 84 70
## Enclosed.Porch X3Ssn.Porch Screen.Porch Mo.Sold Yr.Sold Sale.Condition
## 121 0 0 266 6 2010 Normal
## 414 0 0 0 5 2008 Normal
## 707 0 0 0 6 2007 Normal
## 1000 148 0 0 7 2006 Normal
## 1293 0 0 0 5 2010 Normal
## 1586 0 0 0 6 2009 Normal
## SalePrice
## 121 270000
## 414 143000
## 707 302000
## 1000 180000
## 1293 378500
## 1586 271900
Week 2: Spread of Data, Contingency Tables, and Simple Plots
#*****SPREAD OF DATA*****#
# Spread (standard deviation, variance, IQR, Range)
# Skewness, Kurtosis
summary(ames$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000
#Summary for a group of variables (use na.rm=TRUE to remove NAs)
#The pipe operator %>% comes from the dplyr library
ames %>%
summarise(
count = n(),
mean_SP = mean(SalePrice, na.rm = TRUE),
mean_GA = mean(Gr.Liv.Area, na.rm = TRUE)
)
## count mean_SP mean_GA
## 1 2930 180796.1 1499.69
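#The same summarise() pattern extends to groups; a sketch of counts and mean
#sale price by building type
ames %>%
group_by(Bldg.Type) %>%
summarise(count = n(), mean_SP = mean(SalePrice, na.rm = TRUE))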
print(var(ames$SalePrice))
## [1] 6381883616
print(sd(ames$SalePrice))
## [1] 79886.69
print(range(ames$SalePrice))
## [1] 12789 755000
print(max(ames$SalePrice)-min(ames$SalePrice))
## [1] 742211
print(IQR(ames$SalePrice))
## [1] 84000
ames %>%
summarise(
count = n(),
sd_SP = sd(SalePrice, na.rm = TRUE),
sd_GA = sd(Gr.Liv.Area, na.rm = TRUE)
)
## count sd_SP sd_GA
## 1 2930 79886.69 505.5089
#Skewness and kurtosis can be analyzed after loading library(moments)
skewness(ames$SalePrice, na.rm = TRUE) #Right skewed, <-1 or >+1 highly skewed
## [1] 1.742607
kurtosis(ames$SalePrice, na.rm = TRUE) #Greater than 3 so Leptokurtic distribution - fat tails
## [1] 8.108122
ames %>%
summarise(
count = n(),
skew_SP = skewness(SalePrice, na.rm = TRUE),
skew_GA = skewness(Gr.Liv.Area, na.rm = TRUE)
)
## count skew_SP skew_GA
## 1 2930 1.742607 1.273457
#*****CONTINGENCY TABLE*****#
tab1<-table(ames$Kitchen.Qual, ames$Bldg.Type)
tab1
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## Ex 174 0 0 2 29
## Fa 59 8 3 0 0
## Gd 967 3 4 39 147
## Po 1 0 0 0 0
## TA 1224 51 102 60 57
rowSums(tab1)
## Ex Fa Gd Po TA
## 205 70 1160 1 1494
colSums(tab1)
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 2425 62 109 101 233
#As percentages of the column variable (use margin=2 for % by column, margin=1 for % by row)
#prop.table() nested with table() converts counts to proportions
prop.table((tab1), margin=2)*100
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## Ex 7.17525773 0.00000000 0.00000000 1.98019802 12.44635193
## Fa 2.43298969 12.90322581 2.75229358 0.00000000 0.00000000
## Gd 39.87628866 4.83870968 3.66972477 38.61386139 63.09012876
## Po 0.04123711 0.00000000 0.00000000 0.00000000 0.00000000
## TA 50.47422680 82.25806452 93.57798165 59.40594059 24.46351931
prop.table((tab1), margin=1)*100
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## Ex 84.8780488 0.0000000 0.0000000 0.9756098 14.1463415
## Fa 84.2857143 11.4285714 4.2857143 0.0000000 0.0000000
## Gd 83.3620690 0.2586207 0.3448276 3.3620690 12.6724138
## Po 100.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## TA 81.9277108 3.4136546 6.8273092 4.0160643 3.8152610
#The ftable() function displays tables of 3 or more dimensions: ftable(v1, v2, v3); see the sketch below
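#A sketch of a three-way table; these variables are chosen only for illustration
ftable(ames$Central.Air, ames$Kitchen.Qual, ames$Bldg.Type)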
#*****SIMPLE PLOTS*****#
# Histogram, piechart, bar chart, scatter plot
#1. Scatter Plot
qplot(Gr.Liv.Area, SalePrice, data=ames)

qplot(Gr.Liv.Area, SalePrice, data=ames, geom=c("point", "smooth"))

#Same plot with ggplot
ggplot(ames, aes(x =Gr.Liv.Area, y = SalePrice)) +
geom_point()+
labs(title="Sale Price of Homes by Living Area",
x="Living Area (Sq ft)", y = "House Price ($)")

#Add a linear trend line. The alpha parameter controls transparency, with alpha=0 completely transparent
ggplot(ames, aes(x = Gr.Liv.Area, y = SalePrice)) +
geom_point(alpha = 0.6) +
stat_smooth(
method = "lm",
color = "red",
se = FALSE
)

# Change point shapes, colors by bldg.type
ggplot(ames, aes(x=Gr.Liv.Area, y=SalePrice, shape=Bldg.Type, color=Bldg.Type)) +
geom_point()

#2. Histogram
qplot(data=ames, SalePrice, geom = 'histogram')

qplot(data=ames, SalePrice, geom = 'histogram', binwidth = 10000) + xlab('Price')

# Change histogram fill color by group (Bldg.Type)
qplot(SalePrice, data = ames, geom = "histogram", binwidth=10000,
fill = Bldg.Type)

qplot(SalePrice, data = ames, geom = "histogram", binwidth=10000,
xlab = "Price of homes", ylab = "Frequency",
main = "Histogram of SalePrice of Homes")

#You can also use ggplot to make the same plot; map the bare column name
#inside aes() rather than ames$SalePrice (which triggers a warning)
ggplot(data=ames, aes(SalePrice)) +
geom_histogram()

#Histogram plot adjustment
qplot(ames$SalePrice,
geom="histogram",
binwidth=10000,
main="Histogram for SalePrice in Ames",
xlab="Price",
fill=I("blue"),
col=I("red"),
alpha=I(0.2))

#The I() function inhibits the interpretation of its arguments;
#here it affects the col, fill, and alpha arguments.
#Without it, qplot() would treat the value as data to be mapped and print a
#legend, for example one saying col = "red"
#Add a trendline to histogram
#use ggplot for more options on the histogram such as adding a line
hist<-ggplot(data=ames, aes(x=SalePrice)) +
geom_histogram(binwidth=10000, color="darkblue", fill="lightblue")
#Add mean line
hist+geom_vline(aes(xintercept=mean(SalePrice)),
color="blue", linetype="dashed", size=1)

#Add density curve
ggplot(ames, aes(x=SalePrice))+
geom_histogram(aes(y=..density..), color="black", fill="white")+
geom_density(alpha=0.2, fill="#FF6666")

#By Groups
ggplot(ames, aes(x=SalePrice, color=Bldg.Type))+
geom_histogram(binwidth=25000, fill="white", position="dodge")+
theme(legend.position="top")

#position="dodge" places the bars beside one another
# 3. Barplot
ggplot(ames, aes(x = factor(Bldg.Type))) +
geom_bar()

#Control the barplot
#- `stat`: Controls the statistical transformation. The default `stat = "count"` plots counts on the y-axis; for precomputed values, pass `stat = "identity"`
#- `alpha`: Control density of the color (0-1)
#- `fill`: Change the color of the bar
#- `size`: Control the size of the bar
#Change intensity and color
# Change intensity
ggplot(ames,
aes(factor(Bldg.Type))) +
geom_bar(fill = "steelblue",
alpha = 0.5) +
theme_classic()

# Color by group
ggplot(ames, aes(factor(Bldg.Type),
fill = factor(Bldg.Type))) +
geom_bar()

#Side by side bars
ggplot(ames, aes(x = Bldg.Type, fill = House.Style)) +
geom_bar(position = position_dodge()) +
theme_classic()

#Turn your barplot horizontal with coord_flip() function
ggplot(ames, aes(x=Bldg.Type))+
geom_bar(fill="steelblue")+
labs(x="Type of Building", title="Building Type at Ames")+
coord_flip()

#Sorted by frequency
#Set the factor levels in ascending order of frequency
ames <- within(ames,
Bldg.Type <- factor(Bldg.Type,
levels=names(sort(table(Bldg.Type),
decreasing=FALSE))))
ggplot(ames, aes(x=Bldg.Type))+
geom_bar(fill="steelblue")+
labs(x="Type of Building", title="Building Type at Ames")+
coord_flip()

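#If the forcats package is installed, a sketch that orders the bars by
#frequency on the fly without permanently releveling the factor
ggplot(ames, aes(x = forcats::fct_infreq(Bldg.Type))) +
geom_bar(fill = "steelblue") +
labs(x = "Type of Building", title = "Building Type at Ames") +
coord_flip()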
#Bar heights given by the mean of SalePrice
#Create a data frame named data_bar
#that returns the average SalePrice by Bldg.Type
#Call this new variable mean_SalePrice and round the mean to two decimals
data_bar <- ames %>%
mutate(Bldg.Type = factor(Bldg.Type)) %>%
group_by(Bldg.Type) %>%
summarize(mean_SalePrice = round(mean(SalePrice), 2))
bar <-ggplot(data_bar, aes(x = Bldg.Type, y = mean_SalePrice)) +
geom_bar(stat = "identity")
#Add labels (vjust controls placement on vertical bars, hjust on horizontal bars)
bar +
geom_text(aes(label = mean_SalePrice),
vjust = 3.5,
color = "yellow",
size = 3) +
theme_classic()

Week 3: Other Graphical Means and Confidence Intervals
# Other graphical means of representing data
############# 1. Pie Chart ############
table(ames$Bldg.Type)
##
## 2fmCon Twnhs Duplex TwnhsE 1Fam
## 62 101 109 233 2425
bt_chart <-table(ames$Bldg.Type)
pie(bt_chart)

#1a change color
pie(bt_chart,
col=c("steelblue", "steelblue3", "steelblue2", "steelblue1", "skyblue1"))

pie(bt_chart,
col=gray(seq(0.4, 1.0, length = 5)))

#1b. show %labels
pct <- round(bt_chart/sum(bt_chart)*100) # calculate percentages
lbls <- paste(names(bt_chart), pct, "%") # add percents to labels
pie(bt_chart,
col=c("steelblue", "steelblue3", "steelblue2", "steelblue1", "skyblue1"),
labels=lbls)

#1c. pie3D() is not in base R, so install the plotrix package
pie3D(bt_chart,
col=c("steelblue", "steelblue3", "steelblue2", "steelblue1", "skyblue1"),
labels = lbls,
labelcex = 1,
explode=0.1,
theta = 0.8,
main="3D Pie Chart of House Types")

############# 2. Box Plot ############
#2a. use the boxplot() function to create boxplots
boxplot(ames$SalePrice, na.rm=TRUE)

boxplot(ames$SalePrice,
horizontal = TRUE)

#side-by-side box plots
boxplot(SalePrice ~ Bldg.Type, data=ames, col='lightblue')

#You can change the colors of individual boxes by passing a vector of colors to the col argument
boxplot(SalePrice ~ Bldg.Type, data=ames,
col=c('lightblue', 'yellow', 'orange', 'lightgreen', 'coral'))

#2b. you can use ggplot2
ggplot(data=ames, aes(SalePrice)) +
geom_boxplot()

ggplot(data=ames, aes(SalePrice)) +
geom_boxplot() +
coord_flip()

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice)) +
geom_boxplot()

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice, fill=factor(Bldg.Type))) +
geom_boxplot()

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice)) +
geom_boxplot(fill="lightblue1", color="dodgerblue")

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice)) +
geom_boxplot(outlier.size=3, outlier.shape=18, outlier.color="red")

#2c. Identifying outliers
#ggplot defines an outlier by default as something that's > 1.5*IQR from the borders of the box
is_outlier <- function(x) {
return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}
dat <- ames %>% tibble::rownames_to_column(var="outlier") %>%
group_by(Bldg.Type) %>%
mutate(is_outlier=ifelse(is_outlier(SalePrice), SalePrice, as.numeric(NA)))
dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
ggplot(dat, aes(y=SalePrice, x=factor(Bldg.Type))) +
geom_boxplot() +
geom_text(aes(label=outlier),na.rm=TRUE,nudge_y=0.05)

############# 3. Q-Q Plot ############
#Is the variable normally distributed?
#A quantile-quantile plot (QQ plot) is a good first check
#It plots the distribution of the data against the expected normal distribution
#qqnorm() plots your sample against a normal distribution
#qqline() adds a theoretical distribution line to the normal QQ plot
#This line makes it much easier to evaluate whether the points deviate from the reference line
#The closer the points are to the reference line, the more closely the sample follows a normal distribution
with(ames, {
qqnorm(SalePrice);
qqline(SalePrice)})

#If the data is normally distributed, the points fall on the 45° reference line
#If the data is non-normal, the points deviate noticeably from the reference line
#3a. The base qqnorm()/qqline() functions do not provide confidence bands,
#but the qqPlot() function from the car package does
with(ames, qqPlot(SalePrice))

## [1] 683 676
#qqPlot() also flags the two most extreme observations (rows 683 and 676)
#3b. For fun, take the log of SalePrice
sp_Log <- log(ames$SalePrice)
hist(sp_Log)

with(ames, {
qqnorm(sp_Log);
qqline(sp_Log)})

with(ames, qqPlot(sp_Log))

## [1] 1193 2060
############# 4. Confidence Interval for Mean############
#Compute the mean for the sale price and living area variables
mean_SP <- mean(ames$SalePrice, na.rm = TRUE)
mean_GA <- mean(ames$Gr.Liv.Area, na.rm = TRUE)
mean_SP
## [1] 180796.1
mean_GA
## [1] 1499.69
#Compute a confidence interval for a mean by adding and subtracting the margin of error from the point estimate
n=length(ames$SalePrice)
n
## [1] 2930
se_SP <- sd(ames$SalePrice, na.rm = TRUE)/sqrt(n)
se_GA <- sd(ames$Gr.Liv.Area, na.rm = TRUE)/sqrt(n)
se_SP
## [1] 1475.845
se_GA
## [1] 9.338884
#Em = margin of error for the mean (two tails, .025 in each tail of the distribution)
#Since there are two tails, the 95% confidence level implies the 97.5th
#percentile of the t distribution at the upper tail
Em <- qt(.975, df=(n-1))*(se_SP)
Em
## [1] 2893.798
lower_SP <- mean_SP - Em
upper_SP <- mean_SP + Em
#Note: Gr.Liv.Area needs its own margin of error, built from se_GA rather than se_SP
Em_GA <- qt(.975, df=(n-1))*(se_GA)
lower_GA <- mean_GA - Em_GA
upper_GA <- mean_GA + Em_GA
c(lower_SP, upper_SP)
## [1] 177902.3 183689.9
c(lower_GA, upper_GA) #roughly 1481 to 1518 sq ft with the corrected margin of error
#This is an important inference: even though the population mean is unknown,
#we are 95% confident that the true average sale price and the true average size
#of houses in Ames lie between the lower and upper values
#There are a few conditions that have to be met for these intervals to be valid:
#1. Sample size > 30 and an independent sample
#2. A somewhat normal, unimodal population
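#The same 95% interval comes directly from t.test(), which handles the degrees
#of freedom for us; it should match the hand computation above
t.test(ames$SalePrice)$conf.int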
############# 5. Confidence Interval for Proportion############
#proportion of single family homes (Bldg.Type)
n = length(ames$Bldg.Type)
n
## [1] 2930
table(ames$Bldg.Type)
##
## 2fmCon Twnhs Duplex TwnhsE 1Fam
## 62 101 109 233 2425
k = sum(ames$Bldg.Type =="1Fam", na.rm=TRUE)
k
## [1] 2425
pbar=k/n
pbar
## [1] 0.8276451
#82.7% of homes are single family
se_SF = sqrt(pbar* (1 - pbar)/(n))
se_SF
## [1] 0.006977505
#Since there are two tails of the normal distribution, the 95% confidence level
#implies the 97.5th percentile of the normal distribution at the upper tail.
#Therefore, z(alpha/2) is given by qnorm(.975).
#Multiply it by the standard error estimate to compute the margin of error.
Ep = qnorm(0.975)*(se_SF)
pbar + c(-Ep, +Ep)
## [1] 0.8139694 0.8413207
#ALTERNATIVE SOLUTION USING THE BUILT-IN PROP.TEST
#Tests the proportion of single-family homes against a null value of 0.50
prop.test(k, n)
##
## 1-sample proportions test with continuity correction
##
## data: k out of n, null probability 0.5
## X-squared = 1256.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.8133668 0.8410546
## sample estimates:
## p
## 0.8276451
#There are conditions for this interval to be valid:
#1. The sampling distribution of the proportion should be normal, i.e. np and n(1-p) >= 5
#2. The data come from a random sample
a1=n*(pbar)
a2=n*(1-(pbar))
a1
## [1] 2425
a2
## [1] 505
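#For comparison, a sketch using binom.test(), which gives an exact
#(Clopper-Pearson) interval; expect it to be close to the prop.test() interval
binom.test(k, n)$conf.int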
Week 4: Hypothesis Testing - Means and Proportions
#### 1. Test for means####
#The typical value of homes in Iowa is $158,156 (zillow.com)
#Test whether the mean value of homes in Ames, Iowa differs from the state value
#one sample t-test
#two-tailed test
res <-t.test(ames$SalePrice, mu=158156)
res
##
## One Sample t-test
##
## data: ames$SalePrice
## t = 15.34, df = 2929, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 158156
## 95 percent confidence interval:
## 177902.3 183689.9
## sample estimates:
## mean of x
## 180796.1
#here t=15.34 is the t-test statistic, df=2929, and p-value < 2.2e-16
#the mean of x is 180796 and the confidence interval is given as well
#test whether mean value of homes in Ames is less than 158,156
#lower test for means
t.test(ames$SalePrice, mu=158156, alternative="less")
##
## One Sample t-test
##
## data: ames$SalePrice
## t = 15.34, df = 2929, p-value = 1
## alternative hypothesis: true mean is less than 158156
## 95 percent confidence interval:
## -Inf 183224.4
## sample estimates:
## mean of x
## 180796.1
#test whether the mean value of homes in Ames is greater than 158,156
t.test(ames$SalePrice, mu=158156, alternative="greater")
##
## One Sample t-test
##
## data: ames$SalePrice
## t = 15.34, df = 2929, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 158156
## 95 percent confidence interval:
## 178367.7 Inf
## sample estimates:
## mean of x
## 180796.1
#1a. Lower-tailed test for means
#Suppose the manufacturer claims that the mean lifetime of a light bulb is more than 10,000 hours.
#In a sample of 50 light bulbs, it was found that they only last 9,900 hours on average.
#Assume the population standard deviation is 120 hours.
#At .05 significance level, can we reject the claim by the manufacturer?
#Null hypothesis: μ ≥ 10000
xbar=9900 #sample mean
mu0=10000 #hypothesized value
sigma=120 #population standard deviation
n=50 #sample size
t= (xbar-mu0)/(sigma/sqrt(n))
t #test statistic
## [1] -5.892557
pval = pt(t, df=49, lower.tail=TRUE)
pval #lower tail p value (pt() gives the cumulative distribution function)
## [1] 1.712419e-07
#we find that the p value is less than 0.05 and thus, we reject the null that μ ≥ 10000
#critical value (if the test statistic is less than this value, the test is statistically significant)
alpha=0.05
t.alpha=qt(alpha, 49, lower.tail=TRUE) #use the quantile function qt() to get the critical value at alpha
t.alpha
## [1] -1.676551
#reject the null since the test statistic is lower than the critical value of -1.676
#1b. Upper-tailed test for means
#Suppose the food label on a cookie bag states that there is at most 2 grams of saturated fat
#in a single cookie. In a sample of 65 cookies, it is found that the mean amount
#of saturated fat per cookie is 2.1 grams.
#Assume that the population standard deviation is 0.25 grams.
#At .05 significance level, can we reject the claim on food label?
#The null hypothesis is μ ≤ 2
xbar=2.1 #sample mean
mu0=2 #hypothesized value
sigma=0.25 #population standard deviation
n=65 #sample size
t=(xbar-mu0)/(sigma/sqrt(n))
t #test statistic
## [1] 3.224903
pvalue=pt(t, df=64, lower.tail=FALSE)
pvalue #upper tail p-value
## [1] 0.000992899
#p value is 0.0009 so we reject the null
#critical value
alpha=0.05
t.alpha=qt(alpha, 64, lower.tail=FALSE)
t.alpha
## [1] 1.669013
#The test statistic (3.22) is larger than the critical value t.alpha (1.669),
#so we reject the null hypothesis.
#1c. Two-tailed test for means
#Suppose the mean weight of King Penguins found in an Antarctic colony last year was 15.4 kg.
#In a sample of 125 penguins same time this year in the same colony,
#the mean penguin weight is 14.6 kg.
#Assume the population standard deviation is 2.5 kg. At .05 significance level,
#can we reject the null hypothesis that the mean penguin weight does not differ from last year?
#The null hypothesis is that μ = 15.4
xbar=14.6 #sample mean
mu0=15.4 #hypothesized value
sigma=2.5 #population standard deviation
n=125 #sample size
t=(xbar-mu0)/(sigma/sqrt(n))
t #test statistic
## [1] -3.577709
pvalue=2*pt(t, df=124, lower.tail=TRUE)
pvalue #two-tail p-value (lower tail=TRUE since sample mean < mu)
## [1] 0.0004953873
#p value is 0.0005, so we reject the null
#critical value (two-tailed test, so put alpha/2 in the lower tail)
alpha=0.05
t.alpha=qt(alpha/2, 124)
t.alpha
## [1] -1.97928
#Since the t statistic of -3.58 is below the critical value of -1.979, we reject the null
#### 2. Test for proportions####
#By default prop.test() applies a continuity correction (the binomial test is corrected toward the normal approximation)
#80% of homes in Iowa are single family
k = sum(ames$Bldg.Type =="1Fam", na.rm=TRUE)
k
## [1] 2425
n=2930
k/n
## [1] 0.8276451
prop.test(k, n, p=.80, correct=FALSE) #two-tailed test
##
## 1-sample proportions test without continuity correction
##
## data: k out of n, null probability 0.8
## X-squared = 13.995, df = 1, p-value = 0.0001833
## alternative hypothesis: true p is not equal to 0.8
## 95 percent confidence interval:
## 0.8135426 0.8408895
## sample estimates:
## p
## 0.8276451
prop.test(k, n, p=0.80, correct=FALSE, alternative="less") #lower-tailed test#
##
## 1-sample proportions test without continuity correction
##
## data: k out of n, null probability 0.8
## X-squared = 13.995, df = 1, p-value = 0.9999
## alternative hypothesis: true p is less than 0.8
## 95 percent confidence interval:
## 0.0000000 0.8388184
## sample estimates:
## p
## 0.8276451
prop.test(k, n, p=0.80, correct=FALSE, alternative="greater") #upper-tailed test#
##
## 1-sample proportions test without continuity correction
##
## data: k out of n, null probability 0.8
## X-squared = 13.995, df = 1, p-value = 9.163e-05
## alternative hypothesis: true p is greater than 0.8
## 95 percent confidence interval:
## 0.8158671 1.0000000
## sample estimates:
## p
## 0.8276451
#2a. Lower-tailed test for proportions
#Suppose 60% of citizens voted in the last election.
#85 out of 148 people in a telephone survey said that they voted in the current election.
#At the .05 significance level, can we reject the null hypothesis that the
#proportion of voters in the population is above 60% this year?
#The null hypothesis is that p ≥ 0.6.
pbar = 85/148 # sample proportion
p0 = .6 # hypothesized value
n = 148 # sample size
z = (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
## [1] -0.6375983
#Compute p value
pval = pnorm(z, lower.tail=TRUE)
pval # lower tail p−value
## [1] 0.2618676
#As it turns out to be greater than the .05 significance level,
#we do not reject the null hypothesis that p ≥ 0.6.
#We can also compute the critical value at .05 significance level.
alpha = .05
z.alpha = qnorm(1-alpha)
-z.alpha # critical value (negate z.alpha for a lower-tailed test)
## [1] -1.644854
#The test statistic -0.6376 is not less than the critical value of -1.6449.
#Hence, at .05 significance level, we do not reject the null hypothesis
#that the proportion of voters in the population is above 60% this year.
#2b. Upper-tailed test for proportions
#Suppose that 12% of the apples harvested in an orchard last year were rotten.
#30 out of 214 apples in a harvest sample this year turn out to be rotten.
#At the .05 significance level, can we reject the null hypothesis that
#the proportion of rotten apples in the harvest stays below 12% this year?
#The null hypothesis is that p ≤ 0.12.
pbar = 30/214 # sample proportion
p0 = .12 # hypothesized value
n = 214 # sample size
z = (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
## [1] 0.908751
#Compute p-value
pval = pnorm(z, lower.tail=FALSE)
pval # upper tail p−value
## [1] 0.1817408
#As it turns out to be greater than the .05 significance level,
#we do not reject the null hypothesis that p ≤ 0.12.
#We can also compute the critical value at .05 significance level.
alpha = .05
z.alpha = qnorm(1-alpha)
z.alpha # critical value
## [1] 1.644854
#The test statistic 0.90875 is not greater than the critical value of 1.6449.
#Hence, at .05 significance level, we do not reject the null hypothesis that
#the proportion of rotten apples in harvest stays below 12% this year.
#2c. Two-tailed test for proportions
#Suppose a coin toss turns up 12 heads out of 20 trials. At .05 significance level,
#can one reject the null hypothesis that the coin toss is fair?
#The null hypothesis is that p = 0.5.
pbar = 12/20 # sample proportion
p0 = .5 # hypothesized value
n = 20 # sample size
z = (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
## [1] 0.8944272
# Compute p-value
pval = 2 * pnorm(z, lower.tail=FALSE) # upper tail
pval # two−tailed p−value
## [1] 0.3710934
#It doubles the upper tail p-value as the sample proportion
#is greater than the hypothesized value.
#Since it turns out to be greater than the .05 significance level,
#we do not reject the null hypothesis that p = 0.5.
#We can compute the critical values at .05 significance level.
alpha = .05
z.half.alpha = qnorm(1-alpha/2)
c(-z.half.alpha, z.half.alpha)
## [1] -1.959964 1.959964
#The test statistic 0.89443 lies between the critical values -1.9600 and 1.9600.
#Hence, at .05 significance level, we do not reject the null hypothesis
#that the coin toss is fair.
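#As a cross-check, prop.test() without the continuity correction reproduces
#this z-test: the reported X-squared equals z^2 (0.89443^2 = 0.8) and the
#p-value matches 0.3711
prop.test(12, 20, p = 0.5, correct = FALSE)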
Week 5: Correlation and Linear Regression
#CORRELATION#
# scatterplot between Sale Price and Above Ground Living Area
ggplot(ames) +
aes(x = Gr.Liv.Area, y = SalePrice) +
geom_point(colour = "#0c4c8a") +
theme_minimal()

#We can remove some outliers from the dataset.
#In particular, drop the homes larger than 4,000 sq ft that sold below $800,000, so we won't use homes that large
ames2 = ames[-which(ames$Gr.Liv.Area>4000 & ames$SalePrice<800000),]
ggplot(ames2) +
aes(x = Gr.Liv.Area, y = SalePrice) +
geom_point(colour = "#0c4c8a") +
theme_minimal()

#Correlation Coefficient
cor(ames2$SalePrice, ames2$Gr.Liv.Area)
## [1] 0.7194627
# Pearson correlation test: a hypothesis test to decide whether the population
#correlation coefficient (rho) is close to 0 or significantly different from 0
test <- cor.test(ames2$SalePrice, ames2$Gr.Liv.Area)
test
##
## Pearson's product-moment correlation
##
## data: ames2$SalePrice and ames2$Gr.Liv.Area
## t = 56.006, df = 2923, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7015122 0.7365008
## sample estimates:
## cor
## 0.7194627
#SIMPLE LINEAR REGRESSION#
#Model a linear relationship between the total above ground living space of a home (Gr.Liv.Area)
#and sale price (SalePrice). To fit an OLS regression model in R we can use the lm() function
#(textbook version: model1 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_train))
model1 <- lm(SalePrice ~ Gr.Liv.Area, data = ames2)
#We can also use summary() to get a more detailed report of the model results.
summary(model1)
##
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area, data = ames2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -201358 -30336 -1655 23303 330127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6773.484 3260.418 2.077 0.0378 *
## Gr.Liv.Area 116.225 2.075 56.006 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54570 on 2923 degrees of freedom
## Multiple R-squared: 0.5176, Adjusted R-squared: 0.5175
## F-statistic: 3137 on 1 and 2923 DF, p-value: < 2.2e-16
str(model1)
## List of 12
## $ coefficients : Named num [1:2] 6773 116
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "Gr.Liv.Area"
## $ residuals : Named num [1:2925] -26964 -79768 -130433 -79049 -42697 ...
## ..- attr(*, "names")= chr [1:2925] "1" "2" "3" "4" ...
## $ effects : Named num [1:2925] -9757248 -3056117 -127159 -77680 -39851 ...
## ..- attr(*, "names")= chr [1:2925] "(Intercept)" "Gr.Liv.Area" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:2925] 197964 218768 289433 215049 272697 ...
## ..- attr(*, "names")= chr [1:2925] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:2925, 1:2] -54.0833 0.0185 0.0185 0.0185 0.0185 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:2925] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "Gr.Liv.Area"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.02 1.01
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 2923
## $ xlevels : Named list()
## $ call : language lm(formula = SalePrice ~ Gr.Liv.Area, data = ames2)
## $ terms :Classes 'terms', 'formula' language SalePrice ~ Gr.Liv.Area
## .. ..- attr(*, "variables")= language list(SalePrice, Gr.Liv.Area)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "SalePrice" "Gr.Liv.Area"
## .. .. .. ..$ : chr "Gr.Liv.Area"
## .. ..- attr(*, "term.labels")= chr "Gr.Liv.Area"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(SalePrice, Gr.Liv.Area)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "SalePrice" "Gr.Liv.Area"
## $ model :'data.frame': 2925 obs. of 2 variables:
## ..$ SalePrice : int [1:2925] 171000 139000 159000 136000 230000 190000 200000 153337 148325 269500 ...
## ..$ Gr.Liv.Area: int [1:2925] 1645 1824 2432 1792 2288 2544 2634 1224 1229 2787 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language SalePrice ~ Gr.Liv.Area
## .. .. ..- attr(*, "variables")= language list(SalePrice, Gr.Liv.Area)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "SalePrice" "Gr.Liv.Area"
## .. .. .. .. ..$ : chr "Gr.Liv.Area"
## .. .. ..- attr(*, "term.labels")= chr "Gr.Liv.Area"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(SalePrice, Gr.Liv.Area)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "SalePrice" "Gr.Liv.Area"
## - attr(*, "class")= chr "lm"
#You can also get the elements from the following
#coef <- coefficients(model1) # coefficients
#resid <- residuals(model1) # residuals
#pred <- predict(model1) # fitted values
#rsq <- summary(model1)$r.squared # R-sq for the fit
#se <- summary(model1)$sigma # se of the fit
#To get the statistics of the coefficients, you need to use summary:
#stat.coef <- summary(model1)$coefficients
#coef <- stat.coef[,1] # 1st column: coefficients (same as above)
#se.coef <- stat.coef[,2] # 2nd column: se for each coef
#t.coef <- stat.coef[,3] # 3rd column: t-value for each coef
#p.coef <- stat.coef[,4] # 4th column: p-value for each coefficient
#Estimate of Error variance
#Typically, these error metrics are computed on a separate validation set or
#using cross-validation (ML course);
#however, they can also be computed on the same training data the model was trained on
sigma(model1) # RMSE
## [1] 54568.13
sigma(model1)^2 # MSE
## [1] 2977680331
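#Equivalently, by hand: sigma() is the residual standard error, the square root
#of the residual sum of squares divided by the residual degrees of freedom
sqrt(sum(residuals(model1)^2) / df.residual(model1))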
#Note that the RMSE is also reported as the Residual standard error in the output from summary().
#We can construct such (one-at-a-time) confidence intervals for each coefficient using confint().
#For example, a 95% confidence intervals for the coefficients in our SLR example can be computed using
confint(model1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 380.5340 13166.4335
## Gr.Liv.Area 112.1562 120.2944
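#A sketch of the same interval by hand from the coefficient table:
#estimate +/- t critical value * standard error
est <- summary(model1)$coefficients["Gr.Liv.Area", "Estimate"]
se <- summary(model1)$coefficients["Gr.Liv.Area", "Std. Error"]
est + c(-1, 1) * qt(0.975, df.residual(model1)) * se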
#Linear regression diagnostic plots (plot() on an lm object produces four plots)
plot(model1)




#Fitting new data: pass a data frame of new predictor values to predict()
#predict(model1, mynewdata)
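#A sketch: predictions for hypothetical homes of 1,000 and 2,000 square feet
newdata <- data.frame(Gr.Liv.Area = c(1000, 2000))
predict(model1, newdata) #point predictions
predict(model1, newdata, interval = "confidence") #CI for the mean response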
##########TRANSFORMATION OF RESPONSE############
logSalePrice <-log(ames2$SalePrice)
ggplot(ames2) +
aes(x = Gr.Liv.Area, y = logSalePrice) +
geom_point(colour = "#0c4c8a") +
theme_minimal()

model2 <- lm(logSalePrice ~ Gr.Liv.Area, data = ames2)
summary(model2)
##
## Call:
## lm(formula = logSalePrice ~ Gr.Liv.Area, data = ames2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17037 -0.15249 0.03129 0.16804 0.90367
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.113e+01 1.705e-02 652.80 <2e-16 ***
## Gr.Liv.Area 5.939e-04 1.085e-05 54.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2854 on 2923 degrees of freedom
## Multiple R-squared: 0.506, Adjusted R-squared: 0.5058
## F-statistic: 2994 on 1 and 2923 DF, p-value: < 2.2e-16
coef <- coefficients(model2) # coefficients
finalcoeff = exp(coef)
finalcoeff
## (Intercept) Gr.Liv.Area
## 68360.496528 1.000594
#For a 100 square foot increase in above ground living area, exponentiate coef*100
#(about a 6.1% increase in expected sale price)
exp(.0005939377*100)
## [1] 1.061193
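#A sketch: to predict on the dollar scale from the log model, exponentiate the
#fitted value (a hypothetical 1,500 sq ft home; note this back-transform
#targets the median of SalePrice rather than the mean)
new_home <- data.frame(Gr.Liv.Area = 1500)
exp(predict(model2, new_home))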