Read Data
#Use read.csv() to read in the data
ames <- read.csv("/Users/KAP/Documents/School/02 - MSBA/2 - Statistics for Business Analytics/AmesHousing2.csv")
Week 1: Summarizing Data and Sampling
#*****SUMMARIZE DATA*****#
#Numeric Variables
#The mean() and median() functions compute the mean and median
mean(ames$SalePrice)
## [1] 180796.1
median(ames$SalePrice)
## [1] 160000
#Means of multiple numeric columns using the colMeans() function
colMeans(ames[sapply(ames, is.numeric)])
## Order Lot.Frontage Lot.Area Overall.Qual Overall.Cond
## 1.465500e+03 NA 1.014792e+04 6.094881e+00 5.563140e+00
## Year.Built Year.Remod.Add Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 1.971356e+03 1.984267e+03 NA NA 4.921840e+02
## X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 1.159558e+03 3.354560e+02 1.499690e+03 1.566553e+00 3.795222e-01
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr TotRms.AbvGrd Fireplaces
## 1.756314e+00 2.854266e+00 1.044369e+00 6.443003e+00 5.993174e-01
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## NA 9.375188e+01 4.753345e+01 2.301160e+01 2.592491e+00
## Screen.Porch Mo.Sold Yr.Sold SalePrice
## 1.600205e+01 6.216041e+00 2.007790e+03 1.807961e+05
#Set na.rm=TRUE to ignore any missing values
colMeans(ames[sapply(ames, is.numeric)], na.rm=TRUE)
## Order Lot.Frontage Lot.Area Overall.Qual Overall.Cond
## 1.465500e+03 6.922459e+01 1.014792e+04 6.094881e+00 5.563140e+00
## Year.Built Year.Remod.Add Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 1.971356e+03 1.984267e+03 5.592625e+02 1.051615e+03 4.921840e+02
## X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 1.159558e+03 3.354560e+02 1.499690e+03 1.566553e+00 3.795222e-01
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr TotRms.AbvGrd Fireplaces
## 1.756314e+00 2.854266e+00 1.044369e+00 6.443003e+00 5.993174e-01
## Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch
## 4.728197e+02 9.375188e+01 4.753345e+01 2.301160e+01 2.592491e+00
## Screen.Porch Mo.Sold Yr.Sold SalePrice
## 1.600205e+01 6.216041e+00 2.007790e+03 1.807961e+05
#The mode of a variable is computed using the mfv() function in the modeest R package
mfv(ames$Mo.Sold)
## [1] 6
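#If modeest is unavailable, a minimal sketch of a mode helper (hypothetical
#name my_mode); it returns the most frequent value(s) as character labels
my_mode <- function(x) {
  tab <- table(x)               #frequency of each distinct value
  names(tab)[tab == max(tab)]   #keep the value(s) with the highest count
}
my_mode(ames$Mo.Sold)           #should agree with mfv() above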
#The quantile() function gives the min, max, and the three quartiles
quantile(ames$SalePrice)
## 0% 25% 50% 75% 100%
## 12789 129500 160000 213500 755000
#Compute deciles (seq() generates the probabilities, values between 0 and 1)
quantile(ames$SalePrice, seq(0, 1, 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70%
## 12789.0 105450.0 124000.0 135000.0 146500.0 160000.0 178536.0 199500.0
## 80% 90% 100%
## 230000.0 281241.7 755000.0
#Compute an overall summary of a variable
#If the column is numeric, the mean, median, min, max, and quartiles are returned
#Categorical variables are stored as factors in R
#If the column is a factor, the number of observations in each level is returned
summary(ames$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000
#Categorical Variables (count)
table(ames$Bldg.Type)
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 2425 62 109 101 233
#Counts as proportions of the total
table(ames$Bldg.Type)/nrow(ames)
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 0.82764505 0.02116041 0.03720137 0.03447099 0.07952218
#*****SAMPLING*****#
# GENERATE A RANDOM SAMPLE OF SIZE 100
rnorm(n=100, mean = 15, sd= 6)
## [1] 16.75622780 8.16419192 15.60728599 17.51960715 17.57796063 12.98722622
## [7] 14.16273832 15.97625259 13.29205379 13.96767337 12.90834901 16.68244360
## [13] 4.46487095 3.35807447 23.10103347 18.93839768 10.04724240 26.66551515
## [19] 20.81420622 10.75944430 13.64387453 15.46035213 8.34392092 20.87654151
## [25] 12.46609229 10.87382351 12.44834926 10.92443575 21.46025306 9.94062438
## [31] 14.13387433 13.74577930 23.52771038 16.35024676 12.20843018 0.08748966
## [37] 5.81357682 16.01764646 18.06445810 20.52347297 9.17207574 20.62620500
## [43] 21.31819120 7.08327746 3.41010815 16.91708732 13.72520858 9.08926876
## [49] 14.95325486 21.66039494 12.00784910 8.37056396 16.59159429 10.55510419
## [55] 8.33446953 21.85680913 4.80748742 14.39455793 23.04420476 18.76003843
## [61] 16.70052040 5.56025373 18.68000090 8.43453507 18.73700796 3.84706878
## [67] 16.61942229 15.87927552 9.13172069 6.09697766 19.79429457 20.38876310
## [73] 15.94359014 7.76213003 9.81809185 17.26651516 12.50890153 18.59172966
## [79] 15.85644772 22.86546997 12.92925633 22.53647892 20.24181925 17.27501869
## [85] 16.47940169 18.74982297 15.54325285 19.89758151 17.92858534 7.94706655
## [91] 19.47092910 29.71529770 11.44714126 13.60318841 8.37474176 22.72923315
## [97] 24.08987315 10.84944888 8.71826703 23.65582484
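#rnorm() gives different draws on every run; a sketch using set.seed() to make
#the sample reproducible (the seed value 123 is arbitrary)
set.seed(123)
rnorm(n = 100, mean = 15, sd = 6)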
#There are a number of sampling methods we can use in R
#SIMPLE RANDOM SAMPLE
#Create a vector of 100 random numbers pulled from the normal distribution
nd <- rnorm(100)
#Sample 10 of those numbers at random (simple random sample)
#The sample() function generates a random sample from a given population
sample(nd, 10)
## [1] 0.05662768 2.41552638 -1.22220786 -0.15597747 -1.04228030 0.45661309
## [7] -0.73608084 -1.03843872 0.74064982 0.07551901
#Selecting a random sample from a data frame (Ames data)
#Two-step process: first draw 10 row indices at random
index <- sample(1:nrow(ames), 10)
#Then subset the dataset using this index to get the random sample
ames[index, ]
## Order Lot.Frontage Lot.Area Lot.Shape Utilities Lot.Config Land.Slope
## 827 2217 NA 11275 IR1 AllPub Corner Mod
## 2166 1728 59 15810 IR1 AllPub Inside Gtl
## 2537 2300 69 7599 Reg AllPub Corner Gtl
## 247 533 65 8127 IR1 AllPub Inside Gtl
## 1371 470 51 3635 Reg AllPub Inside Gtl
## 700 1812 NA 11613 IR2 AllPub Corner Gtl
## 2368 2048 59 4484 IR1 AllPub Corner Gtl
## 2614 2487 70 9100 Reg AllPub Inside Gtl
## 1547 753 92 5520 Reg AllPub Corner Gtl
## 2663 2568 85 8500 Reg AllPub Inside Gtl
## Neighborhood Bldg.Type House.Style Overall.Qual Overall.Cond Year.Built
## 827 Crawfor 1Fam 1.5Fin 6 7 1932
## 2166 Gilbert 1Fam 2Story 6 5 2007
## 2537 Mitchel 1Fam 1Story 4 6 1982
## 247 Somerst 1Fam 2Story 7 5 2003
## 1371 Blmngtn TwnhsE 1Story 7 5 2007
## 700 SawyerW 1Fam 2Story 6 5 1993
## 2368 SWISU 2fmCon 1.5Fin 5 6 1942
## 2614 Sawyer Duplex 1Story 5 5 1963
## 1547 OldTown 1Fam 2.5Fin 6 6 1912
## 2663 NAmes 1Fam 1Story 5 3 1961
## Year.Remod.Add Foundation Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 827 1950 CBlock 0 854 854
## 2166 2007 CBlock 768 768 0
## 2537 2006 CBlock 280 845 565
## 247 2003 PConc 402 812 410
## 1371 2007 PConc 398 1386 988
## 700 1997 PConc 384 864 480
## 2368 1979 CBlock 187 672 485
## 2614 1963 CBlock 396 1728 1332
## 1547 1950 CBlock 755 755 0
## 2663 1961 CBlock 635 1235 600
## Central.Air X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 827 Y 1096 895 1991 1 1
## 2166 Y 768 728 1496 3 0
## 2537 Y 845 0 845 1 0
## 247 Y 812 841 1653 2 1
## 1371 Y 1569 0 1569 2 0
## 700 Y 920 900 1820 2 1
## 2368 N 778 504 1282 1 0
## 2614 Y 1728 0 1728 2 0
## 1547 Y 929 929 2229 1 0
## 2663 Y 1235 0 1235 1 0
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd
## 827 1.5 3 1 TA 7
## 2166 3.0 3 1 Gd 7
## 2537 1.0 2 1 TA 4
## 247 2.5 3 1 Gd 6
## 1371 2.0 1 1 Gd 7
## 700 2.5 3 1 Gd 7
## 2368 1.0 2 1 TA 4
## 2614 2.0 6 2 TA 10
## 1547 1.0 5 1 TA 8
## 2663 1.0 2 1 TA 6
## Fireplaces Garage.Type Garage.Area Wood.Deck.SF Open.Porch.SF
## 827 1 Detchd 432 0 0
## 2166 0 Attchd 572 100 0
## 2537 0 Detchd 360 0 0
## 247 0 Attchd 628 0 45
## 1371 1 Attchd 660 143 20
## 700 0 Attchd 492 144 85
## 2368 0 Detchd 240 0 88
## 2614 0 Detchd 504 0 0
## 1547 0 <NA> 0 0 198
## 2663 0 Attchd 480 0 0
## Enclosed.Porch X3Ssn.Porch Screen.Porch Mo.Sold Yr.Sold Sale.Condition
## 827 19 0 0 3 2007 Normal
## 2166 0 0 0 5 2007 Partial
## 2537 0 0 0 6 2007 Normal
## 247 0 0 0 3 2009 Normal
## 1371 0 0 0 5 2009 Normal
## 700 0 0 0 5 2007 Normal
## 2368 0 0 0 7 2007 Normal
## 2614 0 0 0 11 2006 Abnorml
## 1547 30 0 0 7 2009 Abnorml
## 2663 0 0 0 12 2006 Abnorml
## SalePrice
## 827 220000
## 2166 181755
## 2537 129500
## 247 209000
## 1371 175900
## 700 191750
## 2368 108500
## 2614 125000
## 1547 104000
## 2663 98600
#STRATIFIED RANDOM SAMPLE
#In the dplyr package, use the group_by() function for the strata and sample_n() for the number of samples in each group
ames %>%
group_by(Bldg.Type) %>%
sample_n(5)
## # A tibble: 25 x 41
## # Groups: Bldg.Type [5]
## Order Lot.Frontage Lot.Area Lot.Shape Utilities Lot.Config Land.Slope
## <int> <int> <int> <chr> <chr> <chr> <chr>
## 1 319 92 12003 Reg AllPub Corner Gtl
## 2 427 98 12704 Reg AllPub Inside Gtl
## 3 1285 30 5232 IR3 AllPub Inside Gtl
## 4 2866 70 12320 IR1 AllPub Inside Gtl
## 5 343 85 10200 Reg AllPub Inside Gtl
## 6 1415 70 7000 Reg AllPub Inside Gtl
## 7 1516 55 5687 Reg AllPub Inside Gtl
## 8 2871 90 15750 Reg AllPub Corner Gtl
## 9 2283 NA 32463 Reg AllPub Inside Mod
## 10 2524 79 13110 IR1 AllPub Corner Gtl
## # … with 15 more rows, and 34 more variables: Neighborhood <chr>,
## # Bldg.Type <chr>, House.Style <chr>, Overall.Qual <int>, Overall.Cond <int>,
## # Year.Built <int>, Year.Remod.Add <int>, Foundation <chr>,
## # Bsmt.Unf.SF <int>, Total.Bsmt.SF <int>, BaseLivArea <int>,
## # Central.Air <chr>, X1st.Flr.SF <int>, X2nd.Flr.SF <int>, Gr.Liv.Area <int>,
## # Full.Bath <int>, Half.Bath <int>, Bathrooms <dbl>, Bedroom.AbvGr <int>,
## # Kitchen.AbvGr <int>, Kitchen.Qual <chr>, TotRms.AbvGrd <int>,
## # Fireplaces <int>, Garage.Type <chr>, Garage.Area <int>, Wood.Deck.SF <int>,
## # Open.Porch.SF <int>, Enclosed.Porch <int>, X3Ssn.Porch <int>,
## # Screen.Porch <int>, Mo.Sold <int>, Yr.Sold <int>, Sale.Condition <chr>,
## # SalePrice <int>
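#In newer dplyr versions, slice_sample() supersedes sample_n(); a sketch of a
#proportionate stratified sample (1% of each Bldg.Type; the proportion is arbitrary)
ames %>%
group_by(Bldg.Type) %>%
slice_sample(prop = 0.01)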
#SYSTEMATIC SAMPLE
#Write a function that computes the step k to take through the dataset
#N is the total number of rows, n is the sample size, and k = N/n is the step
#sample() picks a random starting row between 1 and k
#seq() then generates every kth row index from that starting point
obtain_sys = function(N,n){
k=ceiling(N/n)
r=sample(1:k, 1)
seq(r, r+k*(n-1), k)
}
sys.sample = ames[obtain_sys(nrow(ames), 10), ]
head(sys.sample)
## Order Lot.Frontage Lot.Area Lot.Shape Utilities Lot.Config Land.Slope
## 121 246 83 11980 Reg AllPub Inside Mod
## 414 1048 24 2308 Reg AllPub FR2 Gtl
## 707 1833 NA 9572 IR1 AllPub Inside Gtl
## 1000 2593 94 22136 Reg AllPub Inside Gtl
## 1293 322 89 13214 IR1 AllPub Inside Gtl
## 1586 803 90 14684 IR1 AllPub CulDSac Gtl
## Neighborhood Bldg.Type House.Style Overall.Qual Overall.Cond Year.Built
## 121 SawyerW 1Fam 1Story 7 5 1987
## 414 NPkVill TwnhsE 2Story 6 6 1975
## 707 NoRidge 1Fam 2Story 8 5 1990
## 1000 OldTown 2fmCon 1.5Fin 5 5 1925
## 1293 Timber 1Fam 1Story 9 5 2008
## 1586 SawyerW 1Fam 1Story 7 7 1990
## Year.Remod.Add Foundation Bsmt.Unf.SF Total.Bsmt.SF BaseLivArea
## 121 1987 CBlock 0 1433 1433
## 414 1975 CBlock 275 855 580
## 707 1990 PConc 971 1453 482
## 1000 1975 CBlock 1153 2171 1018
## 1293 2009 PConc 2002 2002 0
## 1586 1991 CBlock 1496 2158 662
## Central.Air X1st.Flr.SF X2nd.Flr.SF Gr.Liv.Area Full.Bath Half.Bath
## 121 Y 1433 0 1433 1 1
## 414 Y 855 601 1456 2 1
## 707 Y 1453 1357 2810 2 1
## 1000 Y 1392 1248 2640 2 1
## 1293 Y 2018 0 2018 2 0
## 1586 Y 2196 0 2196 2 0
## Bathrooms Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd
## 121 1.5 1 1 Gd 4
## 414 2.5 4 1 TA 7
## 707 2.5 4 1 Gd 9
## 1000 2.5 5 1 TA 10
## 1293 2.0 3 1 Ex 10
## 1586 2.0 3 1 Gd 7
## Fireplaces Garage.Type Garage.Area Wood.Deck.SF Open.Porch.SF
## 121 2 Attchd 528 0 278
## 414 0 Attchd 460 0 0
## 707 1 Attchd 750 500 0
## 1000 1 Attchd 1008 631 48
## 1293 1 Attchd 746 144 76
## 1586 1 Attchd 701 84 70
## Enclosed.Porch X3Ssn.Porch Screen.Porch Mo.Sold Yr.Sold Sale.Condition
## 121 0 0 266 6 2010 Normal
## 414 0 0 0 5 2008 Normal
## 707 0 0 0 6 2007 Normal
## 1000 148 0 0 7 2006 Normal
## 1293 0 0 0 5 2010 Normal
## 1586 0 0 0 6 2009 Normal
## SalePrice
## 121 270000
## 414 143000
## 707 302000
## 1000 180000
## 1293 378500
## 1586 271900
Week 2: Spread of Data, Contingency Tables, and Simple Plots
#*****SPREAD OF DATA*****#
# Spread (standard deviation, variance, IQR, Range)
# Skewness, Kurtosis
summary(ames$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000
#Summary for a group of variables (use na.rm=TRUE to remove NAs)
#The pipe operator %>% comes from the dplyr library
ames %>%
summarise(
count = n(),
mean_SP = mean(SalePrice, na.rm = TRUE),
mean_GA = mean(Gr.Liv.Area, na.rm = TRUE)
)
## count mean_SP mean_GA
## 1 2930 180796.1 1499.69
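#The same summarise() pattern extends to groups; a sketch of counts and mean
#sale price by building type
ames %>%
group_by(Bldg.Type) %>%
summarise(count = n(), mean_SP = mean(SalePrice, na.rm = TRUE))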
print(var(ames$SalePrice))
## [1] 6381883616
print(sd(ames$SalePrice))
## [1] 79886.69
print(range(ames$SalePrice))
## [1] 12789 755000
print(max(ames$SalePrice)-min(ames$SalePrice))
## [1] 742211
print(IQR(ames$SalePrice))
## [1] 84000
ames %>%
summarise(
count = n(),
sd_SP = sd(SalePrice, na.rm = TRUE),
sd_GA = sd(Gr.Liv.Area, na.rm = TRUE)
)
## count sd_SP sd_GA
## 1 2930 79886.69 505.5089
#Skewness and kurtosis can be analyzed after loading library(moments)
skewness(ames$SalePrice, na.rm = TRUE) #Right skewed, <-1 or >+1 highly skewed
## [1] 1.742607
kurtosis(ames$SalePrice, na.rm = TRUE) #Greater than 3 so Leptokurtic distribution - fat tails
## [1] 8.108122
ames %>%
summarise(
count = n(),
skew_SP = skewness(SalePrice, na.rm = TRUE),
skew_GA = skewness(Gr.Liv.Area, na.rm = TRUE)
)
## count skew_SP skew_GA
## 1 2930 1.742607 1.273457
#*****CONTINGENCY TABLE*****#
tab1<-table(ames$Kitchen.Qual, ames$Bldg.Type)
tab1
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## Ex 174 0 0 2 29
## Fa 59 8 3 0 0
## Gd 967 3 4 39 147
## Po 1 0 0 0 0
## TA 1224 51 102 60 57
rowSums(tab1)
## Ex Fa Gd Po TA
## 205 70 1160 1 1494
colSums(tab1)
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 2425 62 109 101 233
#As percentages of the column variable (use margin=2 for % by column, margin=1 for % by row)
#prop.table() nested with table() converts counts to proportions
prop.table((tab1), margin=2)*100
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## Ex 7.17525773 0.00000000 0.00000000 1.98019802 12.44635193
## Fa 2.43298969 12.90322581 2.75229358 0.00000000 0.00000000
## Gd 39.87628866 4.83870968 3.66972477 38.61386139 63.09012876
## Po 0.04123711 0.00000000 0.00000000 0.00000000 0.00000000
## TA 50.47422680 82.25806452 93.57798165 59.40594059 24.46351931
prop.table((tab1), margin=1)*100
##
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## Ex 84.8780488 0.0000000 0.0000000 0.9756098 14.1463415
## Fa 84.2857143 11.4285714 4.2857143 0.0000000 0.0000000
## Gd 83.3620690 0.2586207 0.3448276 3.3620690 12.6724138
## Po 100.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## TA 81.9277108 3.4136546 6.8273092 4.0160643 3.8152610
#The ftable() function displays tables of 3 or more dimensions: ftable(v1, v2, v3); see the sketch below
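#A sketch of a three-way table; these variables are chosen only for illustration
ftable(ames$Central.Air, ames$Kitchen.Qual, ames$Bldg.Type)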
#*****SIMPLE PLOTS*****#
# Histogram, piechart, bar chart, scatter plot
#1. Scatter Plot
qplot(Gr.Liv.Area, SalePrice, data=ames)

qplot(Gr.Liv.Area, SalePrice, data=ames, geom=c("point", "smooth"))

#Same plot with ggplot
ggplot(ames, aes(x =Gr.Liv.Area, y = SalePrice)) +
geom_point()+
labs(title="Sale Price of Homes by Living Area",
x="Living Area (Sq ft)", y = "House Price ($)")

#Add a linear trend line. The alpha parameter controls transparency, with alpha=0 completely transparent
ggplot(ames, aes(x = Gr.Liv.Area, y = SalePrice)) +
geom_point(alpha = 0.6) +
stat_smooth(
method = "lm",
color = "red",
se = FALSE
)

# Change point shapes, colors by bldg.type
ggplot(ames, aes(x=Gr.Liv.Area, y=SalePrice, shape=Bldg.Type, color=Bldg.Type)) +
geom_point()

#2. Histogram
qplot(data=ames, SalePrice, geom = 'histogram')

qplot(data=ames, SalePrice, geom = 'histogram', binwidth = 10000) + xlab('Price')

# Change histogram fill color by group (Bldg.Type)
qplot(SalePrice, data = ames, geom = "histogram", binwidth=10000,
fill = Bldg.Type)

qplot(SalePrice, data = ames, geom = "histogram", binwidth=10000,
xlab = "Price of homes", ylab = "Frequency",
main = "Histogram of SalePrice of Homes")

#You can also use ggplot to make the same plot; map the bare column name
#inside aes() rather than ames$SalePrice (which triggers a warning)
ggplot(data=ames, aes(SalePrice)) +
geom_histogram()

#Histogram plot adjustment
qplot(ames$SalePrice,
geom="histogram",
binwidth=10000,
main="Histogram for SalePrice in Ames",
xlab="Price",
fill=I("blue"),
col=I("red"),
alpha=I(0.2))

#The I() function inhibits the interpretation of its arguments;
#here it affects the col, fill, and alpha arguments.
#Without it, qplot() would treat the value as data to be mapped and print a
#legend, for example one saying col = "red"
#Add a trendline to histogram
#use ggplot for more options on the histogram such as adding a line
hist<-ggplot(data=ames, aes(x=SalePrice)) +
geom_histogram(binwidth=10000, color="darkblue", fill="lightblue")
#Add mean line
hist+geom_vline(aes(xintercept=mean(SalePrice)),
color="blue", linetype="dashed", size=1)

#Add density curve
ggplot(ames, aes(x=SalePrice))+
geom_histogram(aes(y=..density..), color="black", fill="white")+
geom_density(alpha=0.2, fill="#FF6666")

#By Groups
ggplot(ames, aes(x=SalePrice, color=Bldg.Type))+
geom_histogram(binwidth=25000, fill="white", position="dodge")+
theme(legend.position="top")

#position="dodge" places the bars beside one another
# 3. Barplot
ggplot(ames, aes(x = factor(Bldg.Type))) +
geom_bar()

#Control the barplot
#- `stat`: Controls the statistical transformation. The default `stat = "count"` plots counts on the y-axis; for precomputed values, pass `stat = "identity"`
#- `alpha`: Control density of the color (0-1)
#- `fill`: Change the color of the bar
#- `size`: Control the size of the bar
#Change intensity and color
# Change intensity
ggplot(ames,
aes(factor(Bldg.Type))) +
geom_bar(fill = "steelblue",
alpha = 0.5) +
theme_classic()

# Color by group
ggplot(ames, aes(factor(Bldg.Type),
fill = factor(Bldg.Type))) +
geom_bar()

#Side by side bars
ggplot(ames, aes(x = Bldg.Type, fill = House.Style)) +
geom_bar(position = position_dodge()) +
theme_classic()

#Turn your barplot horizontal with coord_flip() function
ggplot(ames, aes(x=Bldg.Type))+
geom_bar(fill="steelblue")+
labs(x="Type of Building", title="Building Type at Ames")+
coord_flip()

#Sorted by frequency
#Set the factor levels in ascending order of frequency
ames <- within(ames,
Bldg.Type <- factor(Bldg.Type,
levels=names(sort(table(Bldg.Type),
decreasing=FALSE))))
ggplot(ames, aes(x=Bldg.Type))+
geom_bar(fill="steelblue")+
labs(x="Type of Building", title="Building Type at Ames")+
coord_flip()

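#If the forcats package is installed, a sketch that orders the bars by
#frequency on the fly without permanently releveling the factor
ggplot(ames, aes(x = forcats::fct_infreq(Bldg.Type))) +
geom_bar(fill = "steelblue") +
labs(x = "Type of Building", title = "Building Type at Ames") +
coord_flip()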
#Bar heights given by the mean of SalePrice
#Create a data frame named data_bar
#that returns the average SalePrice by Bldg.Type
#Call this new variable mean_SalePrice and round the mean to two decimals
data_bar <- ames %>%
mutate(Bldg.Type = factor(Bldg.Type)) %>%
group_by(Bldg.Type) %>%
summarize(mean_SalePrice = round(mean(SalePrice), 2))
bar <-ggplot(data_bar, aes(x = Bldg.Type, y = mean_SalePrice)) +
geom_bar(stat = "identity")
#Add labels (vjust controls placement on vertical bars, hjust on horizontal bars)
bar +
geom_text(aes(label = mean_SalePrice),
vjust = 3.5,
color = "yellow",
size = 3) +
theme_classic()

Week 3: Other Graphical Means and Confidence Intervals
# Other graphical means of representing data
############# 1. Pie Chart ############
table(ames$Bldg.Type)
##
## 2fmCon Twnhs Duplex TwnhsE 1Fam
## 62 101 109 233 2425
bt_chart <-table(ames$Bldg.Type)
pie(bt_chart)

#1a change color
pie(bt_chart,
col=c("steelblue", "steelblue3", "steelblue2", "steelblue1", "skyblue1"))

pie(bt_chart,
col=gray(seq(0.4, 1.0, length = 5)))

#1b. show %labels
pct <- round(bt_chart/sum(bt_chart)*100) # calculate percentages
lbls <- paste(names(bt_chart), pct, "%") # add percents to labels
pie(bt_chart,
col=c("steelblue", "steelblue3", "steelblue2", "steelblue1", "skyblue1"),
labels=lbls)

#1c. pie3D() is not in base R, so install the plotrix package
pie3D(bt_chart,
col=c("steelblue", "steelblue3", "steelblue2", "steelblue1", "skyblue1"),
labels = lbls,
labelcex = 1,
explode=0.1,
theta = 0.8,
main="3D Pie Chart of House Types")

############# 2. Box Plot ############
#2a. use the boxplot() function to create boxplots
boxplot(ames$SalePrice, na.rm=TRUE)

boxplot(ames$SalePrice,
horizontal = TRUE)

#side-by-side box plots
boxplot(SalePrice ~ Bldg.Type, data=ames, col='lightblue')

#You can change the colors of individual boxes by passing a vector of colors to the col argument
boxplot(SalePrice ~ Bldg.Type, data=ames,
col=c('lightblue', 'yellow', 'orange', 'lightgreen', 'coral'))

#2b. you can use ggplot2
ggplot(data=ames, aes(SalePrice)) +
geom_boxplot()

ggplot(data=ames, aes(SalePrice)) +
geom_boxplot() +
coord_flip()

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice)) +
geom_boxplot()

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice, fill=factor(Bldg.Type))) +
geom_boxplot()

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice)) +
geom_boxplot(fill="lightblue1", color="dodgerblue")

ggplot(data=ames, aes(x=factor(Bldg.Type), y=SalePrice)) +
geom_boxplot(outlier.size=3, outlier.shape=18, outlier.color="red")

#2c. Identifying outliers
#ggplot defines an outlier by default as something that's > 1.5*IQR from the borders of the box
is_outlier <- function(x) {
return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}
dat <- ames %>% tibble::rownames_to_column(var="outlier") %>%
group_by(Bldg.Type) %>%
mutate(is_outlier=ifelse(is_outlier(SalePrice), SalePrice, as.numeric(NA)))
dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
ggplot(dat, aes(y=SalePrice, x=factor(Bldg.Type))) +
geom_boxplot() +
geom_text(aes(label=outlier),na.rm=TRUE,nudge_y=0.05)

############# 3. Q-Q Plot ############
#Is the variable normally distributed?
#A quantile-quantile plot (QQ plot) is a good first check
#It plots the distribution of the data against the expected normal distribution
#qqnorm() plots your sample against a normal distribution
#qqline() adds a theoretical distribution line to the normal QQ plot
#This line makes it much easier to evaluate whether the points deviate from the reference line
#The closer the points are to the reference line, the more closely the sample follows a normal distribution
with(ames, {
qqnorm(SalePrice);
qqline(SalePrice)})

#If the data is normally distributed, the points fall on the 45° reference line
#If the data is non-normal, the points deviate noticeably from the reference line
#3a. The base qqnorm()/qqline() functions do not provide confidence bands,
#but the qqPlot() function from the car package does
with(ames, qqPlot(SalePrice))

## [1] 683 676
#qqPlot() also flags the two most extreme observations (rows 683 and 676)
#3b. For fun, take the log of SalePrice
sp_Log <- log(ames$SalePrice)
hist(sp_Log)

with(ames, {
qqnorm(sp_Log);
qqline(sp_Log)})

with(ames, qqPlot(sp_Log))

## [1] 1193 2060
############# 4. Confidence Interval for Mean############
#Compute the mean for the sale price and living area variables
mean_SP <- mean(ames$SalePrice, na.rm = TRUE)
mean_GA <- mean(ames$Gr.Liv.Area, na.rm = TRUE)
mean_SP
## [1] 180796.1
mean_GA
## [1] 1499.69
#Compute a confidence interval for a mean by adding and subtracting the margin of error from the point estimate
n=length(ames$SalePrice)
n
## [1] 2930
se_SP <- sd(ames$SalePrice, na.rm = TRUE)/sqrt(n)
se_GA <- sd(ames$Gr.Liv.Area, na.rm = TRUE)/sqrt(n)
se_SP
## [1] 1475.845
se_GA
## [1] 9.338884
#Em = margin of error for the mean (two tails, .025 in each tail of the distribution)
#Since there are two tails, the 95% confidence level implies the 97.5th
#percentile of the t distribution at the upper tail
Em <- qt(.975, df=(n-1))*(se_SP)
Em
## [1] 2893.798
lower_SP <- mean_SP - Em
upper_SP <- mean_SP + Em
#Note: Gr.Liv.Area needs its own margin of error, built from se_GA rather than se_SP
Em_GA <- qt(.975, df=(n-1))*(se_GA)
lower_GA <- mean_GA - Em_GA
upper_GA <- mean_GA + Em_GA
c(lower_SP, upper_SP)
## [1] 177902.3 183689.9
c(lower_GA, upper_GA) #roughly 1481 to 1518 sq ft with the corrected margin of error
#This is an important inference: even though the population mean is unknown,
#we are 95% confident that the true average sale price and the true average size
#of houses in Ames lie between the lower and upper values
#There are a few conditions that have to be met for these intervals to be valid:
#1. Sample size > 30 and an independent sample
#2. A somewhat normal, unimodal population
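#The same 95% interval comes directly from t.test(), which handles the degrees
#of freedom for us; it should match the hand computation above
t.test(ames$SalePrice)$conf.int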
############# 5. Confidence Interval for Proportion############
#proportion of single family homes (Bldg.Type)
n = length(ames$Bldg.Type)
n
## [1] 2930
table(ames$Bldg.Type)
##
## 2fmCon Twnhs Duplex TwnhsE 1Fam
## 62 101 109 233 2425
k = sum(ames$Bldg.Type =="1Fam", na.rm=TRUE)
k
## [1] 2425
pbar=k/n
pbar
## [1] 0.8276451
#82.7% of homes are single family
se_SF = sqrt(pbar* (1 - pbar)/(n))
se_SF
## [1] 0.006977505
#Since there are two tails of the normal distribution, the 95% confidence level
#implies the 97.5th percentile of the normal distribution at the upper tail.
#Therefore, z(alpha/2) is given by qnorm(.975).
#Multiply it by the standard error estimate to compute the margin of error.
Ep = qnorm(0.975)*(se_SF)
pbar + c(-Ep, +Ep)
## [1] 0.8139694 0.8413207
#ALTERNATIVE SOLUTION USING THE BUILT-IN PROP.TEST
#Tests the proportion of single-family homes against a null value of 0.50
prop.test(k, n)
##
## 1-sample proportions test with continuity correction
##
## data: k out of n, null probability 0.5
## X-squared = 1256.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.8133668 0.8410546
## sample estimates:
## p
## 0.8276451
#There are conditions for this interval to be valid:
#1. The sampling distribution of the proportion should be normal, i.e. np and n(1-p) >= 5
#2. The data come from a random sample
a1=n*(pbar)
a2=n*(1-(pbar))
a1
## [1] 2425
a2
## [1] 505
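#For comparison, a sketch using binom.test(), which gives an exact
#(Clopper-Pearson) interval; expect it to be close to the prop.test() interval
binom.test(k, n)$conf.int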
Week 4: Hypothesis Testing - Means and Proportions
#### 1. Test for means####
#The typical value of homes in Iowa is $158,156 (zillow.com)
#Test whether the mean value of homes in Ames, Iowa differs from the state value
#one sample t-test
#two-tailed test
res <-t.test(ames$SalePrice, mu=158156)
res
##
## One Sample t-test
##
## data: ames$SalePrice
## t = 15.34, df = 2929, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 158156
## 95 percent confidence interval:
## 177902.3 183689.9
## sample estimates:
## mean of x
## 180796.1
#here t=15.34 is the t-test statistic, df=2929, and p-value < 2.2e-16
#the mean of x is 180796 and the confidence interval is given as well
#test whether mean value of homes in Ames is less than 158,156
#lower test for means
t.test(ames$SalePrice, mu=158156, alternative="less")
##
## One Sample t-test
##
## data: ames$SalePrice
## t = 15.34, df = 2929, p-value = 1
## alternative hypothesis: true mean is less than 158156
## 95 percent confidence interval:
## -Inf 183224.4
## sample estimates:
## mean of x
## 180796.1
#test whether the mean value of homes in Ames is greater than 158,156
t.test(ames$SalePrice, mu=158156, alternative="greater")
##
## One Sample t-test
##
## data: ames$SalePrice
## t = 15.34, df = 2929, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 158156
## 95 percent confidence interval:
## 178367.7 Inf
## sample estimates:
## mean of x
## 180796.1
#1a. Lower-tailed test for means
#Suppose the manufacturer claims that the mean lifetime of a light bulb is more than 10,000 hours.
#In a sample of 50 light bulbs, it was found that they only last 9,900 hours on average.
#Assume the population standard deviation is 120 hours.
#At .05 significance level, can we reject the claim by the manufacturer?
#Null hypothesis: μ ≥ 10000
xbar=9900 #sample mean
mu0=10000 #hypothesized value
sigma=120 #population standard deviation
n=50 #sample size
t= (xbar-mu0)/(sigma/sqrt(n))
t #test statistic
## [1] -5.892557
pval = pt(t, df=49, lower.tail=TRUE)
pval #lower tail p value (pt() gives the cumulative distribution function)
## [1] 1.712419e-07
#we find that the p value is less than 0.05 and thus, we reject the null that μ ≥ 10000
#critical value (if the test statistic is less than this value, the test is statistically significant)
alpha=0.05
t.alpha=qt(alpha, 49, lower.tail=TRUE) #use the quantile function qt() to get the critical value at alpha
t.alpha
## [1] -1.676551
#reject the null since the test statistic is lower than the critical value of -1.676
#1b. Upper-tailed test for means
#Suppose the food label on a cookie bag states that there is at most 2 grams of saturated fat
#in a single cookie. In a sample of 65 cookies, it is found that the mean amount
#of saturated fat per cookie is 2.1 grams.
#Assume that the population standard deviation is 0.25 grams.
#At .05 significance level, can we reject the claim on food label?
#The null hypothesis is μ ≤ 2
xbar=2.1 #sample mean
mu0=2 #hypothesized value
sigma=0.25 #population standard deviation
n=65 #sample size
t=(xbar-mu0)/(sigma/sqrt(n))
t #test statistic
## [1] 3.224903
pvalue=pt(t, df=64, lower.tail=FALSE)
pvalue #upper tail p-value
## [1] 0.000992899
#p value is 0.0009 so we reject the null
#critical value
alpha=0.05
t.alpha=qt(alpha, 64, lower.tail=FALSE)
t.alpha
## [1] 1.669013
#The test statistic (3.22) is larger than the critical value t.alpha (1.669),
#so we reject the null hypothesis.
#1c. Two-tailed test for means
#Suppose the mean weight of King Penguins found in an Antarctic colony last year was 15.4 kg.
#In a sample of 125 penguins same time this year in the same colony,
#the mean penguin weight is 14.6 kg.
#Assume the population standard deviation is 2.5 kg. At .05 significance level,
#can we reject the null hypothesis that the mean penguin weight does not differ from last year?
#The null hypothesis is that μ = 15.4
xbar=14.6 #sample mean
mu0=15.4 #hypothesized value
sigma=2.5 #population standard deviation
n=125 #sample size
t=(xbar-mu0)/(sigma/sqrt(n))
t #test statistic
## [1] -3.577709
pvalue=2*pt(t, df=124, lower.tail=TRUE)
pvalue #two-tail p-value (lower tail=TRUE since sample mean < mu)
## [1] 0.0004953873
#p value is 0.0005, so we reject the null
#critical value (two-tailed test, so put alpha/2 in the lower tail)
alpha=0.05
t.alpha=qt(alpha/2, 124)
t.alpha
## [1] -1.97928
#Since the t statistic of -3.58 is below the critical value of -1.979, we reject the null
#### 2. Test for proportions####
#By default prop.test() applies a continuity correction (the binomial test is corrected toward the normal approximation)
#80% of homes in Iowa are single family
k = sum(ames$Bldg.Type =="1Fam", na.rm=TRUE)
k
## [1] 2425
n=2930
k/n
## [1] 0.8276451
prop.test(k, n, p=.80, correct=FALSE) #two-tailed test
##
## 1-sample proportions test without continuity correction
##
## data: k out of n, null probability 0.8
## X-squared = 13.995, df = 1, p-value = 0.0001833
## alternative hypothesis: true p is not equal to 0.8
## 95 percent confidence interval:
## 0.8135426 0.8408895
## sample estimates:
## p
## 0.8276451
prop.test(k, n, p=0.80, correct=FALSE, alternative="less") #lower-tailed test#
##
## 1-sample proportions test without continuity correction
##
## data: k out of n, null probability 0.8
## X-squared = 13.995, df = 1, p-value = 0.9999
## alternative hypothesis: true p is less than 0.8
## 95 percent confidence interval:
## 0.0000000 0.8388184
## sample estimates:
## p
## 0.8276451
prop.test(k, n, p=0.80, correct=FALSE, alternative="greater") #upper-tailed test#
##
## 1-sample proportions test without continuity correction
##
## data: k out of n, null probability 0.8
## X-squared = 13.995, df = 1, p-value = 9.163e-05
## alternative hypothesis: true p is greater than 0.8
## 95 percent confidence interval:
## 0.8158671 1.0000000
## sample estimates:
## p
## 0.8276451
#2a. Lower-tailed test for proportions
#Suppose 60% of citizens voted in the last election.
#85 out of 148 people in a telephone survey said that they voted in the current election.
#At the .05 significance level, can we reject the null hypothesis that the
#proportion of voters in the population is above 60% this year?
#The null hypothesis is that p ≥ 0.6.
pbar = 85/148 # sample proportion
p0 = .6 # hypothesized value
n = 148 # sample size
z = (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
## [1] -0.6375983
#Compute p value
pval = pnorm(z, lower.tail=TRUE)
pval # lower tail p−value
## [1] 0.2618676
#As it turns out to be greater than the .05 significance level,
#we do not reject the null hypothesis that p ≥ 0.6.
#We can also compute the critical value at .05 significance level.
alpha = .05
z.alpha = qnorm(1-alpha)
-z.alpha # critical value (negate z.alpha for a lower-tailed test)
## [1] -1.644854
#The test statistic -0.6376 is not less than the critical value of -1.6449.
#Hence, at .05 significance level, we do not reject the null hypothesis
#that the proportion of voters in the population is above 60% this year.
#2b. Upper-tailed test for proportions
#Suppose that 12% of the apples harvested in an orchard last year were rotten.
#30 out of 214 apples in a harvest sample this year turn out to be rotten.
#At the .05 significance level, can we reject the null hypothesis that
#the proportion of rotten apples in the harvest stays below 12% this year?
#The null hypothesis is that p ≤ 0.12.
pbar = 30/214 # sample proportion
p0 = .12 # hypothesized value
n = 214 # sample size
z = (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
## [1] 0.908751
#Compute p-value
pval = pnorm(z, lower.tail=FALSE)
pval # upper tail p−value
## [1] 0.1817408
#As it turns out to be greater than the .05 significance level,
#we do not reject the null hypothesis that p ≤ 0.12.
#We can also compute the critical value at .05 significance level.
alpha = .05
z.alpha = qnorm(1-alpha)
z.alpha # critical value
## [1] 1.644854
#The test statistic 0.90875 is not greater than the critical value of 1.6449.
#Hence, at .05 significance level, we do not reject the null hypothesis that
#the proportion of rotten apples in harvest stays below 12% this year.
#2c. Two-tailed test for proportions
#Suppose a coin toss turns up 12 heads out of 20 trials. At .05 significance level,
#can one reject the null hypothesis that the coin toss is fair?
#The null hypothesis is that p = 0.5.
pbar = 12/20 # sample proportion
p0 = .5 # hypothesized value
n = 20 # sample size
z = (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
## [1] 0.8944272
# Compute p-value
pval = 2 * pnorm(z, lower.tail=FALSE) # upper tail
pval # two−tailed p−value
## [1] 0.3710934
#It doubles the upper tail p-value as the sample proportion
#is greater than the hypothesized value.
#Since it turns out to be greater than the .05 significance level,
#we do not reject the null hypothesis that p = 0.5.
#We can compute the critical values at .05 significance level.
alpha = .05
z.half.alpha = qnorm(1-alpha/2)
c(-z.half.alpha, z.half.alpha)
## [1] -1.959964 1.959964
#The test statistic 0.89443 lies between the critical values -1.9600 and 1.9600.
#Hence, at .05 significance level, we do not reject the null hypothesis
#that the coin toss is fair.
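#As a cross-check, prop.test() without the continuity correction reproduces
#this z-test: the reported X-squared equals z^2 (0.89443^2 = 0.8) and the
#p-value matches 0.3711
prop.test(12, 20, p = 0.5, correct = FALSE)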
Week 5: Correlation and Linear Regression
#CORRELATION#
# scatterplot between Sale Price and Above Ground Living Area
ggplot(ames) +
aes(x = Gr.Liv.Area, y = SalePrice) +
geom_point(colour = "#0c4c8a") +
theme_minimal()

#We can remove some outliers from the dataset.
#In particular, drop the homes larger than 4,000 sq ft that sold below $800,000, so we won't use homes that large
ames2 = ames[-which(ames$Gr.Liv.Area>4000 & ames$SalePrice<800000),]
ggplot(ames2) +
aes(x = Gr.Liv.Area, y = SalePrice) +
geom_point(colour = "#0c4c8a") +
theme_minimal()

#Correlation Coefficient
cor(ames2$SalePrice, ames2$Gr.Liv.Area)
## [1] 0.7194627
# Pearson correlation test: a hypothesis test to decide whether the population
#correlation coefficient (rho) is close to 0 or significantly different from 0
test <- cor.test(ames2$SalePrice, ames2$Gr.Liv.Area)
test
##
## Pearson's product-moment correlation
##
## data: ames2$SalePrice and ames2$Gr.Liv.Area
## t = 56.006, df = 2923, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7015122 0.7365008
## sample estimates:
## cor
## 0.7194627
#SIMPLE LINEAR REGRESSION#
#Model a linear relationship between the total above ground living space of a home (Gr.Liv.Area)
#and sale price (SalePrice). To fit an OLS regression model in R we can use the lm() function
#(textbook version: model1 <- lm(Sale_Price ~ Gr_Liv_Area, data = ames_train))
model1 <- lm(SalePrice ~ Gr.Liv.Area, data = ames2)
#We can also use summary() to get a more detailed report of the model results.
summary(model1)
##
## Call:
## lm(formula = SalePrice ~ Gr.Liv.Area, data = ames2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -201358 -30336 -1655 23303 330127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6773.484 3260.418 2.077 0.0378 *
## Gr.Liv.Area 116.225 2.075 56.006 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54570 on 2923 degrees of freedom
## Multiple R-squared: 0.5176, Adjusted R-squared: 0.5175
## F-statistic: 3137 on 1 and 2923 DF, p-value: < 2.2e-16
str(model1)
## List of 12
## $ coefficients : Named num [1:2] 6773 116
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "Gr.Liv.Area"
## $ residuals : Named num [1:2925] -26964 -79768 -130433 -79049 -42697 ...
## ..- attr(*, "names")= chr [1:2925] "1" "2" "3" "4" ...
## $ effects : Named num [1:2925] -9757248 -3056117 -127159 -77680 -39851 ...
## ..- attr(*, "names")= chr [1:2925] "(Intercept)" "Gr.Liv.Area" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:2925] 197964 218768 289433 215049 272697 ...
## ..- attr(*, "names")= chr [1:2925] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:2925, 1:2] -54.0833 0.0185 0.0185 0.0185 0.0185 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:2925] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "Gr.Liv.Area"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.02 1.01
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 2923
## $ xlevels : Named list()
## $ call : language lm(formula = SalePrice ~ Gr.Liv.Area, data = ames2)
## $ terms :Classes 'terms', 'formula' language SalePrice ~ Gr.Liv.Area
## .. ..- attr(*, "variables")= language list(SalePrice, Gr.Liv.Area)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "SalePrice" "Gr.Liv.Area"
## .. .. .. ..$ : chr "Gr.Liv.Area"
## .. ..- attr(*, "term.labels")= chr "Gr.Liv.Area"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(SalePrice, Gr.Liv.Area)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "SalePrice" "Gr.Liv.Area"
## $ model :'data.frame': 2925 obs. of 2 variables:
## ..$ SalePrice : int [1:2925] 171000 139000 159000 136000 230000 190000 200000 153337 148325 269500 ...
## ..$ Gr.Liv.Area: int [1:2925] 1645 1824 2432 1792 2288 2544 2634 1224 1229 2787 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language SalePrice ~ Gr.Liv.Area
## .. .. ..- attr(*, "variables")= language list(SalePrice, Gr.Liv.Area)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "SalePrice" "Gr.Liv.Area"
## .. .. .. .. ..$ : chr "Gr.Liv.Area"
## .. .. ..- attr(*, "term.labels")= chr "Gr.Liv.Area"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(SalePrice, Gr.Liv.Area)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "SalePrice" "Gr.Liv.Area"
## - attr(*, "class")= chr "lm"
#You can also get the elements from the following
#coef <- coefficients(model1) # coefficients
#resid <- residuals(model1) # residuals
#pred <- predict(model1) # fitted values
#rsq <- summary(model1)$r.squared # R-sq for the fit
#se <- summary(model1)$sigma # se of the fit
#To get the statistics of the coefficients, you need to use summary:
#stat.coef <- summary(model1)$coefficients
#coef <- stat.coef[,1] # 1st column: coefficients (same as above)
#se.coef <- stat.coef[,2] # 2nd column: se for each coef
#t.coef <- stat.coef[,3] # 3rd column: t-value for each coef
#p.coef <- stat.coef[,4] # 4th column: p-value for each coefficient
#Estimate of Error variance
#Typically, these error metrics are computed on a separate validation set or
#using cross-validation (ML course);
#however, they can also be computed on the same training data the model was trained on
sigma(model1) # RMSE
## [1] 54568.13
sigma(model1)^2 # MSE
## [1] 2977680331
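#Equivalently, by hand: sigma() is the residual standard error, the square root
#of the residual sum of squares divided by the residual degrees of freedom
sqrt(sum(residuals(model1)^2) / df.residual(model1))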
#Note that the RMSE is also reported as the Residual standard error in the output from summary().
#We can construct such (one-at-a-time) confidence intervals for each coefficient using confint().
#For example, a 95% confidence intervals for the coefficients in our SLR example can be computed using
confint(model1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 380.5340 13166.4335
## Gr.Liv.Area 112.1562 120.2944
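#A sketch of the same interval by hand from the coefficient table:
#estimate +/- t critical value * standard error
est <- summary(model1)$coefficients["Gr.Liv.Area", "Estimate"]
se <- summary(model1)$coefficients["Gr.Liv.Area", "Std. Error"]
est + c(-1, 1) * qt(0.975, df.residual(model1)) * se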
#Linear regression diagnostic plots (plot() on an lm object produces four plots)
plot(model1)




#Fitting new data: pass a data frame of new predictor values to predict()
#predict(model1, mynewdata)
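#A sketch: predictions for hypothetical homes of 1,000 and 2,000 square feet
newdata <- data.frame(Gr.Liv.Area = c(1000, 2000))
predict(model1, newdata) #point predictions
predict(model1, newdata, interval = "confidence") #CI for the mean response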
##########TRANSFORMATION OF RESPONSE############
logSalePrice <-log(ames2$SalePrice)
ggplot(ames2) +
aes(x = Gr.Liv.Area, y = logSalePrice) +
geom_point(colour = "#0c4c8a") +
theme_minimal()

model2 <- lm(logSalePrice ~ Gr.Liv.Area, data = ames2)
summary(model2)
##
## Call:
## lm(formula = logSalePrice ~ Gr.Liv.Area, data = ames2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17037 -0.15249 0.03129 0.16804 0.90367
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.113e+01 1.705e-02 652.80 <2e-16 ***
## Gr.Liv.Area 5.939e-04 1.085e-05 54.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2854 on 2923 degrees of freedom
## Multiple R-squared: 0.506, Adjusted R-squared: 0.5058
## F-statistic: 2994 on 1 and 2923 DF, p-value: < 2.2e-16
coef <- coefficients(model2) # coefficients
finalcoeff = exp(coef)
finalcoeff
## (Intercept) Gr.Liv.Area
## 68360.496528 1.000594
#For a 100 square foot increase in above ground living area, exponentiate coef*100
#(about a 6.1% increase in expected sale price)
exp(.0005939377*100)
## [1] 1.061193
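#A sketch: to predict on the dollar scale from the log model, exponentiate the
#fitted value (a hypothetical 1,500 sq ft home; note this back-transform
#targets the median of SalePrice rather than the mean)
new_home <- data.frame(Gr.Liv.Area = 1500)
exp(predict(model2, new_home))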