For the first assignment, we have included some clues and tips. Try to develop the right habits from there, as we will not include them every time!

Set up

# Point R at the project folder so that relative file names resolve there.
# Paths are written with forward slashes (not backslashes) and must be quoted.
setwd(dir = "/Users/xinliao/RStudio")

Question 5. Monthly births

Bonus. Data entry

# Data entry: month labels and the monthly birth counts for each year,
# listed from January to December.
# `month.name` is R's built-in vector of English month names.
months <- month.name
births_1991 <- c(
  325, 312, 346, 340, 355, 342,
  358, 346, 365, 355, 324, 342
)
births_1992 <- c(
  334, 304, 360, 330, 361, 333,
  352, 350, 357, 345, 332, 325
)

Question 5.1. Construct a line graph displaying the reported number of live births over time.

# Question 5.1: line graph of monthly live births, one line per year.

# Data preparation: month labels (R's built-in `month.name`) and the monthly
# counts for each year, listed from January to December.
months <- month.name
births_1991 <- c(325, 312, 346, 340, 355, 342, 358, 346, 365, 355, 324, 342)
births_1992 <- c(334, 304, 360, 330, 361, 333, 352, 350, 357, 345, 332, 325)

# Long-format data frame: one row per (month, year) combination.
birth_data <- data.frame(
  Month = rep(months, times = 2),          # January..December, twice
  Year = rep(c(1991, 1992), each = 12),    # 12 rows per year
  Births = c(births_1991, births_1992)     # counts in the same row order
)

# Ordered factor so the x axis follows calendar order rather than
# alphabetical order.
birth_data$Month <- factor(
  birth_data$Month,
  levels = months,
  ordered = TRUE
)

# Plot with ggplot2
library(ggplot2)

ggplot(
  birth_data,
  aes(x = Month, y = Births, color = as.factor(Year), group = Year)
) +
  # `linewidth` replaces `size` for lines; `size` was deprecated for line
  # geoms in ggplot2 3.4.0 and triggers a deprecation warning.
  geom_line(linewidth = 1) +
  geom_point(size = 2) +                    # one marker per observation
  xlab("Month") +
  ylab("Number of Births (thousands)") +
  ggtitle("Monthly Live Births in the U.S. (1991 vs 1992)") +
  scale_color_manual(values = c("blue", "red"), name = "Year") +
  theme_minimal() +
  # Tilt the month labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Question 5.2. Based on the two-year data, do you think the number of live births follows a seasonal pattern in the U.S.?

Based on the data, I find that the number of births is lower in the winter months, especially November through February. The number of births in the summer is relatively high compared to the winter, but still not as high as in the spring and fall. This suggests a seasonal pattern, which could be related to biological, climatic, and social factors.

Question 6. State-level nursing home residents

# Import the nursing home data set from the project folder.
# The file name must include its extension (.csv).
setwd(dir = "/Users/xinliao/RStudio")
nursing_data <- read.csv(file = "nursinghome.csv")
head(nursing_data)

Data check

# Preview the data set
head(nursing_data)        # first few rows

# Plot the main variable(s)
# Quick structural checks of the data set
str(nursing_data)         # column types and a sample of the values

## 'data.frame':    51 obs. of  2 variables:
##  $ state   : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ resident: num  35.7 22.5 21.7 50.5 26.7 ...

summary(nursing_data)     # per-column summary

##     state              resident    
##  Length:51          Min.   :13.60  
##  Class :character   1st Qu.:32.85  
##  Mode  :character   Median :44.20  
##                     Mean   :43.89  
##                     3rd Qu.:54.30  
##                     Max.   :74.90

names(nursing_data)       # column names

## [1] "state"    "resident"

c(nrow(nursing_data), ncol(nursing_data))  # number of rows, number of columns

## [1] 51  2

colSums(is.na(nursing_data))  # count of missing values in each column

##    state resident 
##        0        0

Question 6.1. Provide summary statistics for residents. Include the sample size, mean, standard deviation, median, minimum and maximum.

# Summary statistics for the `resident` column (Question 6.1).
# The sample size counts only non-missing values, so it describes the same
# observations as the statistics below, which all drop NAs (na.rm = TRUE).
n <- sum(!is.na(nursing_data$resident))                    # sample size
mean_val <- mean(nursing_data$resident, na.rm = TRUE)      # mean
sd_val <- sd(nursing_data$resident, na.rm = TRUE)          # standard deviation
median_val <- median(nursing_data$resident, na.rm = TRUE)  # median
quantiles <- quantile(nursing_data$resident, na.rm = TRUE) # min, quartiles, max
min_val <- min(nursing_data$resident, na.rm = TRUE)        # minimum
max_val <- max(nursing_data$resident, na.rm = TRUE)        # maximum

# Collect the requested statistics in a one-row table and display it.
summary_stats <- data.frame(
  Sample_Size = n,
  Mean = mean_val,
  SD = sd_val,
  Median = median_val,
  Min = min_val,
  Max = max_val
)
print(summary_stats)

##   Sample_Size     Mean       SD Median  Min  Max
## 1          51 43.89412 14.59326   44.2 13.6 74.9

# Compute the statistics reported in the sentences below.
# The sample size counts non-missing values only, matching the mean and
# standard deviation, which are computed with na.rm = TRUE.
n <- sum(!is.na(nursing_data$resident))                # sample size
mean_val <- mean(nursing_data$resident, na.rm = TRUE)  # mean
sd_val <- sd(nursing_data$resident, na.rm = TRUE)      # standard deviation


cat("The sample size is:", n, "\n")

## The sample size is: 51

# Mean and standard deviation are rounded to two decimals for reporting.
cat("The average (± standard deviation) state-level number of nursing home residents is:", 
    round(mean_val, 2), "±", round(sd_val, 2), "\n")

## The average (± standard deviation) state-level number of nursing home residents is: 43.89 ± 14.59

…

Question 6.2. Which state has the lowest and the highest mean number of nursing home residents per 1000 people 65 years of age and over?

# Rows of the states with the smallest and the largest `resident` value.
# which.min() / which.max() return the first matching row if there are ties.
min_state <- nursing_data[which.min(nursing_data[["resident"]]), ]
max_state <- nursing_data[which.max(nursing_data[["resident"]]), ]


# Report the state with the lowest value.
cat(
  "The state with the lowest number of nursing home residents is:",
  min_state[["state"]],
  "with", min_state[["resident"]], "residents.\n"
)

## The state with the lowest number of nursing home residents is: Hawaii with 13.6 residents.

# Report the state with the highest value.
cat(
  "The state with the highest number of nursing home residents is:",
  max_state[["state"]],
  "with", max_state[["resident"]], "residents.\n"
)

## The state with the highest number of nursing home residents is: South Dakota with 74.9 residents.

Question 6.3. Construct a box plot (with axis and chart titles) for the number of nursing home residents per 1000 population in the United States.

# Box plot (Question 6.3), preceded by a histogram of the same variable.
# Load the ggplot2 package
library(ggplot2)

# Histogram of `resident`, in bins of width 10.
ggplot(nursing_data, aes(x = resident)) +
  geom_histogram(binwidth = 10, color = "black", fill = "lightblue") +
  labs(
    title = "Histogram of Nursing Home Residents per 1000 Population in the U.S.",
    x = "Residents per 1000 Population",
    y = "Frequency"
  ) +
  theme_minimal()

# Box plot of `resident`. Only `y` is mapped, so the x axis carries no
# information; its tick labels, tick marks and vertical grid lines are hidden
# so that readers do not mistake them for data.
ggplot(nursing_data, aes(y = resident)) +
  geom_boxplot(fill = "orange") +
  labs(
    title = "Boxplot of Nursing Home Residents per 1000 Population",
    y = "Residents per 1000 Population"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

Question 6.4. Is the distribution of the number of nursing home residents symmetric or skewed? Describe it.

The distribution of the number of nursing home residents is slightly right-skewed. The histogram shows that many states are concentrated between 20 and 50 residents, while a small number of states have higher values (60+), resulting in a slightly longer tail on the right side of the distribution. This suggests that the overall distribution is slightly skewed towards higher values, but not significantly so.

Question 6.5. Are there any states that could be considered to be outliers?

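The distribution of nursing home residents appears consistent across states, meaning no states were identified as outliers. Using the 1.5 × IQR rule with the quartiles from the summary above (Q1 = 32.85, Q3 = 54.30, IQR = 21.45), the fences are at about 0.68 and 86.48; the observed values range from 13.6 to 74.9, so every state falls inside the fences.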

Question 6.6. Display the number of nursing home residents per 1000 population using a bar graph [with axis and chart titles].

# Bar plot
library(ggplot2)

# One bar per state; reorder() sorts the states by their `resident` value.
ggplot(nursing_data, aes(x = reorder(state, resident), y = resident)) +
  # geom_col() draws bar heights directly from the y values.
  geom_col(fill = "skyblue", color = "black") +
  labs(
    title = "Number of Nursing Home Residents per 1000 Population by State",
    x = "State",
    y = "Residents per 1000 Population"
  ) +
  theme_minimal() +
  # Tilt the state names so the x-axis labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

VTPEH6108 - Assignment 01

Xin Liao

2025-01-28