Pulling the Dataset From the AmesHousing Package
Making a smaller dataset to be synthesized
# Build the Ames housing data, then keep only the five columns that will be
# synthesized further down.
Ames <- make_ames()
Ames_small <- select(
  Ames,
  Gr_Liv_Area, Central_Air, Kitchen_Qual, Year_Sold, Sale_Price
)
Creating synthetic dataset
# Treat the sale year as a categorical variable rather than a number.
Ames_small[["Year_Sold"]] <- as.factor(Ames_small[["Year_Sold"]])
# Codebook check before synthesis: the recorded table reports nmiss = 0 for
# every variable (the original author also read it as showing no negatives).
codebook.syn(Ames_small)[["tab"]]
## variable class nmiss perctmiss ndistinct details
## 1 Gr_Liv_Area integer 0 0 1292
## 2 Central_Air factor 0 0 2
## 3 Kitchen_Qual factor 0 0 5
## 4 Year_Sold factor 0 0 5
## 5 Sale_Price integer 0 0 1032
# Synthesize all five columns of Ames_small with synthpop's syn().
# NOTE(review): no seed is passed, so a re-run will presumably produce different
# synthetic values than the output recorded here -- confirm, and consider `seed =`.
Ames_syn <- syn(data = Ames_small)
##
## Synthesis
## -----------
## Gr_Liv_Area Central_Air Kitchen_Qual Year_Sold Sale_Price
# Describes how the data were synthesized: the method used for each variable,
# followed by summary statistics of the synthetic columns.
summary(Ames_syn)
## Synthetic object with one synthesis using methods:
## Gr_Liv_Area Central_Air Kitchen_Qual Year_Sold Sale_Price
## "sample" "cart" "cart" "cart" "cart"
##
## Gr_Liv_Area Central_Air Kitchen_Qual Year_Sold Sale_Price
## Min. : 334 N: 218 Excellent: 200 2006:653 Min. : 12789
## 1st Qu.:1131 Y:2712 Fair : 92 2007:694 1st Qu.:130000
## Median :1459 Good :1184 2008:625 Median :160000
## Mean :1504 Poor : 0 2009:597 Mean :180925
## 3rd Qu.:1746 Typical :1454 2010:361 3rd Qu.:213500
## Max. :5642 Max. :755000
# Save the synthetic data as CSV under the base name "Ames_syn". The recorded
# output shows a synthesis_info_Ames_syn.txt file being written alongside it.
write.syn(Ames_syn, filename = "Ames_syn", filetype = "csv")
## Synthetic data exported as csv file(s).
## Information on synthetic data written to
## Y:/PDAT 625/Module 3/synthesis_info_Ames_syn.txt
# Pull the synthesized records out of the Ames_syn object as a data frame.
# NOTE(review): this variable shares its name with the syn() function used
# above; the name is kept in case later code relies on it.
syn <- as.data.frame(Ames_syn[["syn"]])
Linear Modeling of Synthetic Data
# Regress Sale_Price on every other synthesized variable, fitting on the
# synthetic data object via lm.synds().
syn_lm <- lm.synds(formula = Sale_Price ~ ., data = Ames_syn)
# Print the combined coefficient estimates and their significance.
summary(syn_lm)
## Fit to synthetic data set with a single synthesis. Inference to coefficients
## and standard errors that would be obtained from the original data.
##
## Call:
## lm.synds(formula = Sale_Price ~ ., data = Ames_syn)
##
## Combined estimates:
## xpct(Beta) xpct(se.Beta) xpct(z) Pr(>|xpct(z)|)
## (Intercept) 1.3084e+05 5.8514e+03 22.3600 < 2e-16 ***
## Gr_Liv_Area 8.1899e+01 1.7547e+00 46.6738 < 2e-16 ***
## Central_AirY 3.5457e+04 3.2620e+03 10.8695 < 2e-16 ***
## Kitchen_QualFair -1.3665e+05 5.8825e+03 -23.2304 < 2e-16 ***
## Kitchen_QualGood -8.5855e+04 3.4431e+03 -24.9352 < 2e-16 ***
## Kitchen_QualTypical -1.3081e+05 3.5670e+03 -36.6728 < 2e-16 ***
## Year_Sold2007 1.7159e+03 2.4026e+03 0.7142 0.47510
## Year_Sold2008 -2.2244e+03 2.4620e+03 -0.9035 0.36627
## Year_Sold2009 -5.2131e+03 2.4916e+03 -2.0923 0.03641 *
## Year_Sold2010 -6.9107e+03 2.8902e+03 -2.3911 0.01680 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Comparing the original dataset to the synthetic dataset using the compare command
# Two-colour palette shared by the comparison plots.
mycols <- c("darkmagenta", "turquoise")
# Compare observed and synthetic distributions for every variable, laid out
# on a 2 x 3 grid of panels.
compare(
  object = Ames_syn,
  data = Ames_small,
  nrow = 2,
  ncol = 3,
  cols = mycols
)
##
## Comparing percentages observed with synthetic

##
## Selected utility measures:
## pMSE S_pMSE df
## Gr_Liv_Area 0.000052 0.614340 4
## Central_Air 0.000054 2.515909 1
## Kitchen_Qual 0.000206 2.418929 4
## Year_Sold 0.000140 1.639817 4
## Sale_Price 0.000062 0.729416 4
# Compare observed and synthetic distributions for Kitchen_Qual only.
# The argument is spelled out as `vars` (the formal name in compare.synds);
# the previous `var =` only worked through R's partial argument matching.
compare(Ames_syn, Ames_small, vars = "Kitchen_Qual", cols = mycols)
##
## Comparing percentages observed with synthetic

##
## Selected utility measures:
## pMSE S_pMSE df
## Kitchen_Qual 0.000206 2.418929 4