library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.2
UZA_data <- read_sav("./mid-term dataset/UZA.sav")
str(UZA_data)
## tibble [157 × 59] (S3: tbl_df/tbl/data.frame)
## $ ua_id : num [1:157] 766 928 1171 1495 1927 ...
## ..- attr(*, "format.spss")= chr "F12.0"
## ..- attr(*, "display_width")= int 12
## $ urbanarea : chr [1:157] "Akron, OH" "Albany, NY" "Albuquerque, NM" "Allentown, PA-NJ" ...
## ..- attr(*, "format.spss")= chr "A19"
## ..- attr(*, "display_width")= int 19
## $ region : dbl+lbl [1:157] 2, 1, 4, 1, 3, 2, 4, 2, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, ...
## ..@ format.spss: chr "F8.0"
## ..@ labels : Named num [1:4] 1 2 3 4
## .. ..- attr(*, "names")= chr [1:4] "Northeast" "Midwest" "South" "West"
## $ pop000 : num [1:157] 582 611 747 633 203 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnpop000 : num [1:157] 6.37 6.42 6.62 6.45 5.31 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ inc000 : num [1:157] 24.3 29.6 25.7 26.6 22.7 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lninc000 : num [1:157] 3.19 3.39 3.24 3.28 3.12 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ hhsize : num [1:157] 2.38 2.33 2.53 2.55 2.63 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lnhhsize : num [1:157] 0.868 0.847 0.93 0.934 0.967 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ veh : num [1:157] 0.421 0.4 0.391 0.391 0.41 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lnveh : num [1:157] -0.866 -0.917 -0.94 -0.94 -0.891 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ area : num [1:157] 379 426 471 323 163 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnarea : num [1:157] 5.94 6.05 6.15 5.78 5.09 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ popden : num [1:157] 1536 1435 1588 1960 1242 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnpopden : num [1:157] 7.34 7.27 7.37 7.58 7.12 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lm : num [1:157] 2.84 3.52 2.92 2.48 4.81 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnlm : num [1:157] 1.043 1.258 1.072 0.909 1.571 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ flm : num [1:157] 0.751 1.011 0.45 0.656 0.937 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnflm : num [1:157] -0.2862 0.0114 -0.7991 -0.4219 -0.0646 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ olm : num [1:157] 2.09 2.51 2.47 1.83 3.88 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnolm : num [1:157] 0.735 0.919 0.905 0.603 1.355 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ rtden : num [1:157] 1.88 4.08 1.836 1.452 0.686 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnrtden : num [1:157] 0.631 1.406 0.608 0.373 -0.376 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ tfreq : num [1:157] 5431 4098 6448 5378 6054 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lntfreq : num [1:157] 8.6 8.32 8.77 8.59 8.71 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## ..- attr(*, "display_width")= int 9
## $ hrt : num [1:157] 0 0 0 0 0 0 0 0 0 96.1 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lnhrt : num [1:157] 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lrt : num [1:157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lnlrt : num [1:157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ tpm : num [1:157] 40.05 83.52 123.16 44.45 7.79 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lntpm : num [1:157] 3.69 4.43 4.81 3.79 2.05 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ fuel : num [1:157] 2.72 2.9 2.7 2.77 2.68 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lnfuel : num [1:157] 1.001 1.065 0.994 1.02 0.987 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ vmt : num [1:157] 25.3 25.9 20.7 21.4 21.5 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lnvmt : num [1:157] 3.23 3.25 3.03 3.06 3.07 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ empden : num [1:157] 660 795 712 871 633 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ pct1500 : num [1:157] 30.2 28 13.1 19.6 19.5 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ pct12500 : num [1:157] 0.565 10.226 2.975 13.55 0.538 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ urbden : num [1:157] 2495 3153 4053 3527 2996 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ denfac : num [1:157] 84.8 96.5 103.3 108.7 89.7 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ mixfac : num [1:157] 102.7 112.8 71.8 134.5 96 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ cenfac : num [1:157] 92.7 116.3 95 105.3 71.7 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ strfac : num [1:157] 98.1 84.9 101.5 149.7 124 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ compact : num [1:157] 90.1 104.8 84.3 145.9 102.3 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lncompact : num [1:157] 4.5 4.65 4.43 4.98 4.63 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ delay : num [1:157] 37.3 33.4 27.4 27 15.2 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ lndelay : num [1:157] 3.62 3.51 3.31 3.3 2.72 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ tti : num [1:157] 1.12 1.17 1.16 1.17 1.08 ...
## ..- attr(*, "format.spss")= chr "F8.4"
## $ lntti : num [1:157] 0.1133 0.157 0.1484 0.157 0.0726 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## $ jobpopwgtwgt: num [1:157] 0.488 0.459 0.376 0.506 0.432 ...
## ..- attr(*, "format.spss")= chr "F12.4"
## ..- attr(*, "display_width")= int 12
## $ entropywgt : num [1:157] 0.541 0.595 0.552 0.615 0.571 ...
## ..- attr(*, "format.spss")= chr "F12.4"
## ..- attr(*, "display_width")= int 12
## $ varpop : num [1:157] 0.74 0.983 0.534 1.135 0.568 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ varemp : num [1:157] 2.07 2.29 2.4 1.76 1.27 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ pctempcen : num [1:157] 12.51 17.74 16.38 7.14 11.24 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 11
## $ pctpopcen : num [1:157] 1.4 2.495 1.742 2.708 0.309 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 11
## $ avgblk : num [1:157] 0.0301 0.0354 0.0298 0.0164 0.0292 ...
## ..- attr(*, "format.spss")= chr "F12.4"
## ..- attr(*, "display_width")= int 12
## $ pctsmlblk : num [1:157] 59.5 61.3 58 77.8 67.9 ...
## ..- attr(*, "format.spss")= chr "F12.4"
## ..- attr(*, "display_width")= int 12
## $ intden : num [1:157] 44.7 41.3 51.5 80.3 42.6 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ pct4way : num [1:157] 27.5 19 28.9 34.9 45.5 ...
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
class(UZA_data$region)
## [1] "haven_labelled" "vctrs_vctr" "double"
#variable type is a double, meaning region data type is ordinal
#Collapse this ratio variable into an ordinal variable of three levels
UZA_data <- UZA_data %>%
mutate(popden_ordinal = case_when(
popden < 1500 ~ "LOW",
popden >= 1500 & popden <= 2000 ~ "MEDIUM",
popden > 2000 ~ "HIGH"
))
#Make a frequency, percentage, and cumulative percentage table of the ordinal variable
freq_table <- table(UZA_data$popden_ordinal)
percent_table <- prop.table(freq_table) * 100
cumulative_percent_table <- cumsum(percent_table)
final_table <- data.frame(Frequency = freq_table,
Percentage = percent_table,
Cumulative_Percentage = cumulative_percent_table)
print(final_table)
## Frequency.Var1 Frequency.Freq Percentage.Var1 Percentage.Freq
## HIGH HIGH 47 HIGH 29.93631
## LOW LOW 63 LOW 40.12739
## MEDIUM MEDIUM 47 MEDIUM 29.93631
## Cumulative_Percentage
## HIGH 29.93631
## LOW 70.06369
## MEDIUM 100.00000
Represent the following variable in UZA.sav graphically
#region (Bar Plot of Frequency)
ggplot(UZA_data, aes(x = region)) + geom_bar() + ggtitle("Region Bar Plot")
#popden (Histogram with 6 bins)
ggplot(UZA_data, aes(x = popden)) + geom_histogram(bins = 6) + ggtitle("Histogram of Popden")
#popden_ordinal (Pie Chart)
ggplot(UZA_data, aes(x = "", fill = popden_ordinal)) + geom_bar(width = 1) + coord_polar(theta = "y") + ggtitle("Pie Chart of Popden_ordinal")
#Compare the means for each variable (popden, inc000) by census regions. On Average, which region is higher in density, and which region is higher in income?
UZA_data %>%
group_by(region) %>%
summarize(mean_popden = mean(popden, na.rm = TRUE),
mean_inc00 = mean(inc000, na.rm = TRUE))
## # A tibble: 4 × 3
## region mean_popden mean_inc00
## <dbl+lbl> <dbl> <dbl>
## 1 1 [Northeast] 1777. 29.8
## 2 2 [Midwest] 1675. 26.3
## 3 3 [South] 1533. 25.8
## 4 4 [West] 2783. 27.7
#On Average, Region 4 is higher in density and Region 1 is higher in income
#Draw a random sample of size 200 from the dataset HTS.household.10regions.csv, and perform a one-sample test to determine whether the population mean of hhsize is different from 2.3
HTS_data <- read.csv("./mid-term dataset/HTS.household.10regions.csv")
set.seed(1234)
#A one-sample t-test does not require you to check for homoscedasticity
rand_df_200 <- HTS_data[sample(nrow(HTS_data), size = 200), ]
t_test_result_200 <- t.test(rand_df_200$hhsize, mu = 2.3)
#With a sample size as large as 200 you do not need to check for normality
t_test_result_200
##
## One Sample t-test
##
## data: rand_df_200$hhsize
## t = 2.1381, df = 199, p-value = 0.03372
## alternative hypothesis: true mean is not equal to 2.3
## 95 percent confidence interval:
## 2.31438 2.65562
## sample estimates:
## mean of x
## 2.485
#Based on the t-test results, since the p-value is less than 0.05, you reject the null hypothesis, and conclude that the mean is 2.3
#Draw a random sample of size 50
set.seed(1234)
#Again, no need to test for homoscedasticity in a one-sample t-test
rand_df_50 <- HTS_data[sample(nrow(HTS_data), size = 50), ]
t_test_result_50 <- t.test(rand_df_50$hhsize, mu = 2.3)
#With a smaller sample size, you need to test for normality
shapiro.test(rand_df_50$hhsize)
##
## Shapiro-Wilk normality test
##
## data: rand_df_50$hhsize
## W = 0.87867, p-value = 0.000102
t_test_result_50
##
## One Sample t-test
##
## data: rand_df_50$hhsize
## t = 0.90741, df = 49, p-value = 0.3686
## alternative hypothesis: true mean is not equal to 2.3
## 95 percent confidence interval:
## 2.129951 2.750049
## sample estimates:
## mean of x
## 2.44
#For the sample size of 50, since the p-value is greater than .05, your conclusion has changed and you can not reject the null hypothesis.
#Create a Bicariate Table for htype and income_cat
bivariate_table <- table(HTS_data$htype, HTS_data$income_cat)
bivariate_table
##
## 1 2 3
## 0 74 93 14
## 1 1739 4153 4157
## 2 321 407 140
## 3 576 585 163
chi_test_result <- chisq.test(bivariate_table)
chi_test_result
##
## Pearson's Chi-squared test
##
## data: bivariate_table
## X-squared = 934.04, df = 6, p-value < 2.2e-16
#The P-value is 2.2e-16, which suggests a relationship between htype and income_cat
lambda_row <- Lambda(bivariate_table, direction = "row")
lambda_column <- Lambda(bivariate_table, direction = "column")
lambda_row
## [1] 0
lambda_column
## [1] 0.0005567929
#Since the values are so close to 0, it suggests no relationship between the two variables