library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.2
UZA_data <- read_sav("./mid-term dataset/UZA.sav")
str(UZA_data)
## tibble [157 × 59] (S3: tbl_df/tbl/data.frame)
##  $ ua_id       : num [1:157] 766 928 1171 1495 1927 ...
##   ..- attr(*, "format.spss")= chr "F12.0"
##   ..- attr(*, "display_width")= int 12
##  $ urbanarea   : chr [1:157] "Akron, OH" "Albany, NY" "Albuquerque, NM" "Allentown, PA-NJ" ...
##   ..- attr(*, "format.spss")= chr "A19"
##   ..- attr(*, "display_width")= int 19
##  $ region      : dbl+lbl [1:157] 2, 1, 4, 1, 3, 2, 4, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, ...
##    ..@ format.spss: chr "F8.0"
##    ..@ labels     : Named num [1:4] 1 2 3 4
##    .. ..- attr(*, "names")= chr [1:4] "Northeast" "Midwest" "South" "West"
##  $ pop000      : num [1:157] 582 611 747 633 203 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnpop000    : num [1:157] 6.37 6.42 6.62 6.45 5.31 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ inc000      : num [1:157] 24.3 29.6 25.7 26.6 22.7 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lninc000    : num [1:157] 3.19 3.39 3.24 3.28 3.12 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ hhsize      : num [1:157] 2.38 2.33 2.53 2.55 2.63 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lnhhsize    : num [1:157] 0.868 0.847 0.93 0.934 0.967 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ veh         : num [1:157] 0.421 0.4 0.391 0.391 0.41 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lnveh       : num [1:157] -0.866 -0.917 -0.94 -0.94 -0.891 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ area        : num [1:157] 379 426 471 323 163 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnarea      : num [1:157] 5.94 6.05 6.15 5.78 5.09 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ popden      : num [1:157] 1536 1435 1588 1960 1242 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnpopden    : num [1:157] 7.34 7.27 7.37 7.58 7.12 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lm          : num [1:157] 2.84 3.52 2.92 2.48 4.81 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnlm        : num [1:157] 1.043 1.258 1.072 0.909 1.571 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ flm         : num [1:157] 0.751 1.011 0.45 0.656 0.937 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnflm       : num [1:157] -0.2862 0.0114 -0.7991 -0.4219 -0.0646 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ olm         : num [1:157] 2.09 2.51 2.47 1.83 3.88 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnolm       : num [1:157] 0.735 0.919 0.905 0.603 1.355 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ rtden       : num [1:157] 1.88 4.08 1.836 1.452 0.686 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnrtden     : num [1:157] 0.631 1.406 0.608 0.373 -0.376 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ tfreq       : num [1:157] 5431 4098 6448 5378 6054 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lntfreq     : num [1:157] 8.6 8.32 8.77 8.59 8.71 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##   ..- attr(*, "display_width")= int 9
##  $ hrt         : num [1:157] 0 0 0 0 0 0 0 0 0 96.1 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lnhrt       : num [1:157] 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lrt         : num [1:157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lnlrt       : num [1:157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ tpm         : num [1:157] 40.05 83.52 123.16 44.45 7.79 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lntpm       : num [1:157] 3.69 4.43 4.81 3.79 2.05 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ fuel        : num [1:157] 2.72 2.9 2.7 2.77 2.68 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lnfuel      : num [1:157] 1.001 1.065 0.994 1.02 0.987 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ vmt         : num [1:157] 25.3 25.9 20.7 21.4 21.5 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lnvmt       : num [1:157] 3.23 3.25 3.03 3.06 3.07 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ empden      : num [1:157] 660 795 712 871 633 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ pct1500     : num [1:157] 30.2 28 13.1 19.6 19.5 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ pct12500    : num [1:157] 0.565 10.226 2.975 13.55 0.538 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ urbden      : num [1:157] 2495 3153 4053 3527 2996 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ denfac      : num [1:157] 84.8 96.5 103.3 108.7 89.7 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ mixfac      : num [1:157] 102.7 112.8 71.8 134.5 96 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ cenfac      : num [1:157] 92.7 116.3 95 105.3 71.7 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ strfac      : num [1:157] 98.1 84.9 101.5 149.7 124 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ compact     : num [1:157] 90.1 104.8 84.3 145.9 102.3 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lncompact   : num [1:157] 4.5 4.65 4.43 4.98 4.63 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ delay       : num [1:157] 37.3 33.4 27.4 27 15.2 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ lndelay     : num [1:157] 3.62 3.51 3.31 3.3 2.72 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ tti         : num [1:157] 1.12 1.17 1.16 1.17 1.08 ...
##   ..- attr(*, "format.spss")= chr "F8.4"
##  $ lntti       : num [1:157] 0.1133 0.157 0.1484 0.157 0.0726 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##  $ jobpopwgtwgt: num [1:157] 0.488 0.459 0.376 0.506 0.432 ...
##   ..- attr(*, "format.spss")= chr "F12.4"
##   ..- attr(*, "display_width")= int 12
##  $ entropywgt  : num [1:157] 0.541 0.595 0.552 0.615 0.571 ...
##   ..- attr(*, "format.spss")= chr "F12.4"
##   ..- attr(*, "display_width")= int 12
##  $ varpop      : num [1:157] 0.74 0.983 0.534 1.135 0.568 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ varemp      : num [1:157] 2.07 2.29 2.4 1.76 1.27 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ pctempcen   : num [1:157] 12.51 17.74 16.38 7.14 11.24 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 11
##  $ pctpopcen   : num [1:157] 1.4 2.495 1.742 2.708 0.309 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 11
##  $ avgblk      : num [1:157] 0.0301 0.0354 0.0298 0.0164 0.0292 ...
##   ..- attr(*, "format.spss")= chr "F12.4"
##   ..- attr(*, "display_width")= int 12
##  $ pctsmlblk   : num [1:157] 59.5 61.3 58 77.8 67.9 ...
##   ..- attr(*, "format.spss")= chr "F12.4"
##   ..- attr(*, "display_width")= int 12
##  $ intden      : num [1:157] 44.7 41.3 51.5 80.3 42.6 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ pct4way     : num [1:157] 27.5 19 28.9 34.9 45.5 ...
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
class(UZA_data$region)
## [1] "haven_labelled" "vctrs_vctr"     "double"
#variable type is a double, meaning region data type is ordinal

#Collapse this ratio variable into an ordinal variable of three levels

UZA_data <- UZA_data %>%
  mutate(popden_ordinal = case_when(
    popden < 1500 ~ "LOW",
    popden >= 1500 & popden <= 2000 ~ "MEDIUM",
    popden > 2000 ~ "HIGH"
    ))

#Make a frequency, percentage, and cumulative percentage table of the ordinal variable

freq_table <- table(UZA_data$popden_ordinal)
percent_table <- prop.table(freq_table) * 100
cumulative_percent_table <- cumsum(percent_table)

final_table <- data.frame(Frequency = freq_table,
                          Percentage = percent_table,
                          Cumulative_Percentage = cumulative_percent_table)
print(final_table)
##        Frequency.Var1 Frequency.Freq Percentage.Var1 Percentage.Freq
## HIGH             HIGH             47            HIGH        29.93631
## LOW               LOW             63             LOW        40.12739
## MEDIUM         MEDIUM             47          MEDIUM        29.93631
##        Cumulative_Percentage
## HIGH                29.93631
## LOW                 70.06369
## MEDIUM             100.00000

Represent the following variable in UZA.sav graphically

#region (Bar Plot of Frequency)
ggplot(UZA_data, aes(x = region)) + geom_bar() + ggtitle("Region Bar Plot")

#popden (Histogram with 6 bins)
ggplot(UZA_data, aes(x = popden)) + geom_histogram(bins = 6) + ggtitle("Histogram of Popden")

#popden_ordinal (Pie Chart)
ggplot(UZA_data, aes(x = "", fill = popden_ordinal)) + geom_bar(width = 1) + coord_polar(theta = "y") + ggtitle("Pie Chart of Popden_ordinal")

#Compare the means for each variable (popden, inc000) by census regions. On Average, which region is higher in density, and which region is higher in income?

UZA_data %>%
  group_by(region) %>%
  summarize(mean_popden = mean(popden, na.rm = TRUE), 
            mean_inc00 = mean(inc000, na.rm = TRUE))
## # A tibble: 4 × 3
##   region        mean_popden mean_inc00
##   <dbl+lbl>           <dbl>      <dbl>
## 1 1 [Northeast]       1777.       29.8
## 2 2 [Midwest]         1675.       26.3
## 3 3 [South]           1533.       25.8
## 4 4 [West]            2783.       27.7
#On Average, Region 4 is higher in density and Region 1 is higher in income

#Draw a random sample of size 200 from the dataset HTS.household.10regions.csv, and perform a one-sample test to determine whether the population mean of hhsize is different from 2.3

HTS_data <- read.csv("./mid-term dataset/HTS.household.10regions.csv")
set.seed(1234)
#A one-sample t-test does not require you to check for homoscedasticity
rand_df_200 <- HTS_data[sample(nrow(HTS_data), size = 200), ]
t_test_result_200 <- t.test(rand_df_200$hhsize, mu = 2.3)
#With a sample size as large as 200 you do not need to check for normality
t_test_result_200
## 
##  One Sample t-test
## 
## data:  rand_df_200$hhsize
## t = 2.1381, df = 199, p-value = 0.03372
## alternative hypothesis: true mean is not equal to 2.3
## 95 percent confidence interval:
##  2.31438 2.65562
## sample estimates:
## mean of x 
##     2.485
#Based on the t-test results, since the p-value is less than 0.05, you reject the null hypothesis, and conclude that the mean is 2.3

#Draw a random sample of size 50

set.seed(1234)
#Again, no need to test for homoscedasticity in a one-sample t-test
rand_df_50 <- HTS_data[sample(nrow(HTS_data), size = 50), ]
t_test_result_50 <- t.test(rand_df_50$hhsize, mu = 2.3)
#With a smaller sample size, you need to test for normality
shapiro.test(rand_df_50$hhsize)
## 
##  Shapiro-Wilk normality test
## 
## data:  rand_df_50$hhsize
## W = 0.87867, p-value = 0.000102
t_test_result_50
## 
##  One Sample t-test
## 
## data:  rand_df_50$hhsize
## t = 0.90741, df = 49, p-value = 0.3686
## alternative hypothesis: true mean is not equal to 2.3
## 95 percent confidence interval:
##  2.129951 2.750049
## sample estimates:
## mean of x 
##      2.44
#For the sample size of 50, since the p-value is greater than .05, your conclusion has changed and you can not reject the null hypothesis.

#Create a Bicariate Table for htype and income_cat

bivariate_table <- table(HTS_data$htype, HTS_data$income_cat)
bivariate_table
##    
##        1    2    3
##   0   74   93   14
##   1 1739 4153 4157
##   2  321  407  140
##   3  576  585  163
chi_test_result <-  chisq.test(bivariate_table)
chi_test_result
## 
##  Pearson's Chi-squared test
## 
## data:  bivariate_table
## X-squared = 934.04, df = 6, p-value < 2.2e-16
#The P-value is 2.2e-16, which suggests a relationship between htype and income_cat
lambda_row <- Lambda(bivariate_table, direction = "row")
lambda_column <- Lambda(bivariate_table, direction = "column")
lambda_row
## [1] 0
lambda_column
## [1] 0.0005567929
#Since the values are so close to 0, it suggests no relationship between the two variables