For the first assignment, we have included some clues and tips. Try to develop the right habits from there, as we will not include them every time!
# Set working directory
# Watch out for the direction of the slashes: R paths use forward slashes (/), not backslashes (\)
# Don't forget the quotation marks around the path
# NOTE(review): this is a machine-specific absolute path; setwd() errors if the directory does not exist.
setwd("/Users/xinliao/RStudio")
# Data preparation: monthly live births (in thousands) for 1991 and 1992.
# `month.name` is base R's built-in vector of English month names
# ("January" ... "December"), so the names need not be typed by hand.
months <- month.name
births_1991 <- c(325, 312, 346, 340, 355, 342, 358, 346, 365, 355, 324, 342)
births_1992 <- c(334, 304, 360, 330, 361, 333, 352, 350, 357, 345, 332, 325)

# Guard against a mistyped series: each year must supply one value per month.
stopifnot(
  length(births_1991) == length(months),
  length(births_1992) == length(months)
)

# Build a long-format data frame: one row per (year, month) pair.
birth_data <- data.frame(
  Month = rep(months, times = 2),  # month names, repeated once per year
  Year = rep(c(1991, 1992), each = length(months)),  # year of each row
  Births = c(births_1991, births_1992)  # number of births
)

# Store Month as an ordered factor whose levels follow the calendar, so that
# plots show the months in natural order rather than alphabetically.
birth_data$Month <- factor(birth_data$Month, levels = months, ordered = TRUE)
# Plot both years as lines over the calendar months with ggplot2.
library(ggplot2)
ggplot(
  birth_data,
  aes(x = Month, y = Births, color = as.factor(Year), group = Year)
) +
  # `linewidth` sets the line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0 and raised a deprecation warning here.
  geom_line(linewidth = 1) +
  geom_point(size = 2) +  # data points
  xlab("Month") +  # x-axis label
  ylab("Number of Births (thousands)") +  # y-axis label
  ggtitle("Monthly Live Births in the U.S. (1991 vs 1992)") +  # plot title
  scale_color_manual(values = c("blue", "red"), name = "Year") +  # line colors
  theme_minimal() +  # minimal theme
  # Tilt the month labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Based on the data, I find that the number of births is lower in the winter months, especially from November through February. Births are relatively higher in the summer than in the winter, but still not as high as in the spring and fall. This pattern could be related to biological, climatic, and social factors.
# Load the nursing-home data set.
# read.csv() needs the complete file name, extension included; the relative
# file name is looked up in the working directory set just before the read.
setwd("/Users/xinliao/RStudio")
nursing_data <- read.csv("nursinghome.csv")
head(nursing_data, n = 6L)  # show the first six rows
Data check
# Preview the data set
# Plot the main variable(s)
# Quick sanity check of the data set: each call below is followed by the
# output it produced (lines starting with ##).
head(nursing_data) # show the first few rows
str(nursing_data) # structure: column names and types
## 'data.frame': 51 obs. of 2 variables:
## $ state : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ resident: num 35.7 22.5 21.7 50.5 26.7 ...
summary(nursing_data) # per-column summary statistics
## state resident
## Length:51 Min. :13.60
## Class :character 1st Qu.:32.85
## Mode :character Median :44.20
## Mean :43.89
## 3rd Qu.:54.30
## Max. :74.90
colnames(nursing_data) # column names
## [1] "state" "resident"
dim(nursing_data) # dimensions (rows, columns)
## [1] 51 2
colSums(is.na(nursing_data)) # number of missing values in each column
## state resident
## 0 0
# Summary statistics for the `resident` column.
# Every statistic below drops missing values (na.rm = TRUE), so the sample
# size counts only the non-missing observations to stay consistent with them.
n <- sum(!is.na(nursing_data$resident))  # sample size (non-missing values)
mean_val <- mean(nursing_data$resident, na.rm = TRUE)  # mean
sd_val <- sd(nursing_data$resident, na.rm = TRUE)  # standard deviation
median_val <- median(nursing_data$resident, na.rm = TRUE)  # median
# Default probabilities: 0%, 25%, 50%, 75%, 100%.
quantiles <- quantile(nursing_data$resident, na.rm = TRUE)
min_val <- min(nursing_data$resident, na.rm = TRUE)  # minimum
max_val <- max(nursing_data$resident, na.rm = TRUE)  # maximum

# Collect the results in a one-row data frame and display it.
summary_stats <- data.frame(
  Sample_Size = n,
  Mean = mean_val,
  SD = sd_val,
  Median = median_val,
  Min = min_val,
  Max = max_val
)
print(summary_stats)
## Sample_Size Mean SD Median Min Max
## 1 51 43.89412 14.59326 44.2 13.6 74.9
# Report the sample size and the mean (± standard deviation) of `resident`.
# The sample size counts non-missing values only, matching the na.rm = TRUE
# used for the mean and the standard deviation.
n <- sum(!is.na(nursing_data$resident))  # sample size (non-missing values)
mean_val <- mean(nursing_data$resident, na.rm = TRUE)  # mean
sd_val <- sd(nursing_data$resident, na.rm = TRUE)  # standard deviation
cat("The sample size is:", n, "\n")
## The sample size is: 51
cat(
  "The average (± standard deviation) state-level number of nursing home residents is:",
  round(mean_val, 2), "±", round(sd_val, 2), "\n"
)
## The average (± standard deviation) state-level number of nursing home residents is: 43.89 ± 14.59
…
# Pick out the rows with the smallest and the largest `resident` value.
# which.min() / which.max() give the position of the first such value.
min_state <- nursing_data[which.min(nursing_data[["resident"]]), ]
max_state <- nursing_data[which.max(nursing_data[["resident"]]), ]
cat(
  "The state with the lowest number of nursing home residents is:",
  min_state[["state"]], "with", min_state[["resident"]], "residents.\n"
)
## The state with the lowest number of nursing home residents is: Hawaii with 13.6 residents.
cat(
  "The state with the highest number of nursing home residents is:",
  max_state[["state"]], "with", max_state[["resident"]], "residents.\n"
)
## The state with the highest number of nursing home residents is: South Dakota with 74.9 residents.
# Box plot
# Load the ggplot2 package
library(ggplot2)
# Histogram of the `resident` column, using bins that are 10 units wide.
ggplot(nursing_data, aes(x = resident)) +
  # light-blue bars with a black outline
  geom_histogram(binwidth = 10, color = "black", fill = "lightblue") +
  ggtitle("Histogram of Nursing Home Residents per 1000 Population in the U.S.") +
  xlab("Residents per 1000 Population") +
  ylab("Frequency") +
  theme_minimal()
# Box plot of the same column.
ggplot(nursing_data, aes(y = resident)) +
  geom_boxplot(fill = "orange") +
  ggtitle("Boxplot of Nursing Home Residents per 1000 Population") +
  ylab("Residents per 1000 Population") +
  theme_minimal()
The distribution of nursing home occupancy is slightly right-skewed. The histogram shows that many states are concentrated between 20 and 50 residents, while a small number of states have higher values (60+), which gives the distribution a slightly longer tail on the right side. The skew towards higher values is therefore mild rather than pronounced.
The number of nursing home residents appears fairly consistent across states: the box plot shows no points beyond its whiskers, so no states were identified as outliers.
# Bar plot
library(ggplot2)
# One bar per state, with the states ordered by their `resident` value.
ggplot(nursing_data, aes(x = reorder(state, resident), y = resident)) +
  # geom_col() takes the bar heights directly from the `y` aesthetic;
  # sky-blue fill with a black outline.
  geom_col(fill = "skyblue", color = "black") +
  ggtitle("Number of Nursing Home Residents per 1000 Population by State") +
  xlab("State") +
  ylab("Residents per 1000 Population") +
  theme_minimal() +
  # Tilt the x-axis labels to keep the state names from overlapping.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))