Pulling the Dataset From the AmesHousing Package

Making a smaller dataset to be synthesized

# Build the Ames data with make_ames(), then keep only the five columns that
# will be synthesized.
Ames <- make_ames()
Ames_small <- select(
  Ames,
  Gr_Liv_Area, Central_Air, Kitchen_Qual, Year_Sold, Sale_Price
)

Creating synthetic dataset

# Treat the sale year as a categorical variable.
Ames_small[["Year_Sold"]] <- as.factor(Ames_small[["Year_Sold"]])
# Per-column codebook: class, missing-value count/percent, distinct values.
codebook.syn(Ames_small)[["tab"]]
##       variable   class nmiss perctmiss ndistinct details
## 1  Gr_Liv_Area integer     0         0      1292        
## 2  Central_Air  factor     0         0         2        
## 3 Kitchen_Qual  factor     0         0         5        
## 4    Year_Sold  factor     0         0         5        
## 5   Sale_Price integer     0         0      1032
# Synthesize Ames_small using synthpop's default method for each variable.
# A fixed seed is passed so the synthetic data, the exported file, and every
# result derived from them can be regenerated exactly; without it syn() picks
# a fresh random seed on each run.
# NOTE(review): the output recorded in this document was produced by the
# original call without an explicit seed, so re-running will print different
# numbers until the document is re-knit.
Ames_syn <- syn(Ames_small, seed = 625)
## 
## Synthesis
## -----------
##  Gr_Liv_Area Central_Air Kitchen_Qual Year_Sold Sale_Price
# Print the synthesis method used for each variable, followed by summary
# statistics of the synthetic data.
summary(Ames_syn)
## Synthetic object with one synthesis using methods:
##  Gr_Liv_Area  Central_Air Kitchen_Qual    Year_Sold   Sale_Price 
##     "sample"       "cart"       "cart"       "cart"       "cart" 
## 
##   Gr_Liv_Area   Central_Air    Kitchen_Qual  Year_Sold    Sale_Price    
##  Min.   : 334   N: 218      Excellent: 200   2006:653   Min.   : 12789  
##  1st Qu.:1131   Y:2712      Fair     :  92   2007:694   1st Qu.:130000  
##  Median :1459               Good     :1184   2008:625   Median :160000  
##  Mean   :1504               Poor     :   0   2009:597   Mean   :180925  
##  3rd Qu.:1746               Typical  :1454   2010:361   3rd Qu.:213500  
##  Max.   :5642                                           Max.   :755000
# Export the synthetic data as CSV under the base name "Ames_syn"; synthpop
# also writes an accompanying synthesis-info text file.
write.syn(
  Ames_syn,
  filename = "Ames_syn",
  filetype = "csv"
)
## Synthetic data exported as csv file(s).
## Information on synthetic data written to
##   Y:/PDAT 625/Module 3/synthesis_info_Ames_syn.txt
# Pull the synthesized data (the `syn` element of the synds object) out as a
# plain data frame.
# NOTE(review): this variable shares its name with synthpop's syn() function
# and is not used again in the visible code; function calls to syn() still
# resolve to the function, but a distinct name would be clearer.
syn <- as.data.frame(Ames_syn[["syn"]])

Linear Modeling of Synthetic Data

# Fit a linear model of Sale_Price on all other variables using the
# synthetic data object.
syn_lm <- lm.synds(formula = Sale_Price ~ ., data = Ames_syn)
# Print the coefficient table for the fitted model.
summary(syn_lm)
## Fit to synthetic data set with a single synthesis. Inference to coefficients
## and standard errors that would be obtained from the original data.
## 
## Call:
## lm.synds(formula = Sale_Price ~ ., data = Ames_syn)
## 
## Combined estimates:
##                      xpct(Beta) xpct(se.Beta)  xpct(z) Pr(>|xpct(z)|)    
## (Intercept)          1.3084e+05    5.8514e+03  22.3600        < 2e-16 ***
## Gr_Liv_Area          8.1899e+01    1.7547e+00  46.6738        < 2e-16 ***
## Central_AirY         3.5457e+04    3.2620e+03  10.8695        < 2e-16 ***
## Kitchen_QualFair    -1.3665e+05    5.8825e+03 -23.2304        < 2e-16 ***
## Kitchen_QualGood    -8.5855e+04    3.4431e+03 -24.9352        < 2e-16 ***
## Kitchen_QualTypical -1.3081e+05    3.5670e+03 -36.6728        < 2e-16 ***
## Year_Sold2007        1.7159e+03    2.4026e+03   0.7142        0.47510    
## Year_Sold2008       -2.2244e+03    2.4620e+03  -0.9035        0.36627    
## Year_Sold2009       -5.2131e+03    2.4916e+03  -2.0923        0.03641 *  
## Year_Sold2010       -6.9107e+03    2.8902e+03  -2.3911        0.01680 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comparing original dataset to synthetic dataset using compare command

# Two plot colours passed to compare() via `cols`.
mycols <- c("darkmagenta", "turquoise")
# Compare every variable's distribution between the synthetic and original
# data, laid out on a 2 x 3 grid of panels.
compare(
  Ames_syn,
  Ames_small,
  nrow = 2,
  ncol = 3,
  cols = mycols
)
## 
## Comparing percentages observed with synthetic

## 
## Selected utility measures:
##                  pMSE   S_pMSE df
## Gr_Liv_Area  0.000052 0.614340  4
## Central_Air  0.000054 2.515909  1
## Kitchen_Qual 0.000206 2.418929  4
## Year_Sold    0.000140 1.639817  4
## Sale_Price   0.000062 0.729416  4
# Compare only Kitchen_Qual between the synthetic and original data.
# The argument is spelled out as `vars` (the formal name in compare.synds)
# instead of relying on partial matching of `var`.
compare(Ames_syn, Ames_small, vars = "Kitchen_Qual", cols = mycols)
## 
## Comparing percentages observed with synthetic

## 
## Selected utility measures:
##                  pMSE   S_pMSE df
## Kitchen_Qual 0.000206 2.418929  4