可能用到的包

library(tidyverse)
## Warning: 程序包'tidyverse'是用R版本4.4.3 来建造的
## Warning: 程序包'ggplot2'是用R版本4.4.3 来建造的
## Warning: 程序包'tibble'是用R版本4.4.3 来建造的
## Warning: 程序包'tidyr'是用R版本4.4.3 来建造的
## Warning: 程序包'readr'是用R版本4.4.3 来建造的
## Warning: 程序包'purrr'是用R版本4.4.3 来建造的
## Warning: 程序包'dplyr'是用R版本4.4.3 来建造的
## Warning: 程序包'stringr'是用R版本4.4.3 来建造的
## Warning: 程序包'forcats'是用R版本4.4.3 来建造的
## Warning: 程序包'lubridate'是用R版本4.4.3 来建造的
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)

导入数据“madision_weather.csv”

# Load the daily weather observations into a tibble.
# NOTE(review): the absolute Windows path is machine-specific; adjust it when
# running on another computer.
madision_weather <- read_csv(file = "C:/Rcourse/data/data/madision_weather.csv")
## Rows: 98538 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): station, name
## dbl (12): latitude, longitude, elevation, year, month, day, prcp, snow, snwd...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

数据说明


1.报告数据维度(多少行,多少列)并输出数据的前10行

# `madision_weather` was already read from the CSV in the import step above,
# so the file is not parsed a second time here.
# dim() returns c(number of rows, number of columns).
dim(madision_weather)
## [1] 98538    14
# First 10 observations of the data set.
head(madision_weather, 10)
## # A tibble: 10 × 14
##    station     name   latitude longitude elevation  year month   day  prcp  snow
##    <chr>       <chr>     <dbl>     <dbl>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     1  0       NA
##  2 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     2  0       NA
##  3 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     3  0.15    NA
##  4 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     4  0.02    NA
##  5 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     5  0.01    NA
##  6 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     6  0.25    NA
##  7 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     7  0.04    NA
##  8 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     8  0.05    NA
##  9 USC00470273 UW AR…     43.0     -89.4      265.  1971    10     9  0       NA
## 10 USC00470273 UW AR…     43.0     -89.4      265.  1971    10    10  0       NA
## # ℹ 4 more variables: snwd <dbl>, tavg <dbl>, tmax <dbl>, tmin <dbl>


2.Madison总共有几个气象站?并输出每个气象站的详细信息(包括纬度、经度、海拔)

# One row per station with its name, coordinates and elevation; the number of
# rows in the result is the number of stations.
distinct(madision_weather, station, name, latitude, longitude, elevation)
## # A tibble: 4 × 5
##   station     name                                  latitude longitude elevation
##   <chr>       <chr>                                    <dbl>     <dbl>     <dbl>
## 1 USC00470273 UW ARBORETUM MADISON, WI US               43.0     -89.4      265.
## 2 USW00014837 MADISON DANE CO REGIONAL AIRPORT, WI…     43.1     -89.3      262.
## 3 USC00471416 CHARMANY FARM, WI US                      43.1     -89.5      319.
## 4 USC00474966 MADISON WEATHER BUREAU CITY, WI US        43.1     -89.4      297.


3.查看“USC00471416”气象站2020年降水量、降雪量、积雪厚度的缺失数量

# Count the missing values of precipitation (prcp), snowfall (snow) and snow
# depth (snwd) recorded by station USC00471416 during 2020.
madision_weather %>%
  filter(station == "USC00471416" & year == 2020) %>%
  summarise(
    across(c(prcp, snow, snwd), ~ sum(is.na(.x)), .names = "{.col}_na")
  )
## # A tibble: 1 × 3
##   prcp_na snow_na snwd_na
##     <int>   <int>   <int>
## 1       5       2       0


4.查看每个气象站统计的起始日期(命名:first_date)和最后日期(命名:last_date),气象站有记录的天数(命名:days_n),应该记录的天数(命名:days_possible),缺失的天数(命名:days_missing)

# Combine the year / month / day components into a single Date column and keep
# it on the data set for the later questions.
madision_weather <- mutate(madision_weather, date = make_date(year, month, day))

# Per-station record coverage:
#   first_date / last_date : earliest and latest observation date
#   days_n                 : number of rows (recorded days) for the station
#   days_possible          : calendar days from first_date to last_date,
#                            both ends included (hence the + 1)
#   days_missing           : calendar days without a row
madision_weather %>%
  group_by(station) %>%
  summarise(
    first_date = min(date, na.rm = TRUE),
    last_date = max(date, na.rm = TRUE),
    days_n = n(),
    days_possible =
      as.numeric(difftime(last_date, first_date, units = "days")) + 1,
    days_missing = days_possible - days_n
  )
## # A tibble: 4 × 6
##   station     first_date last_date  days_n days_possible days_missing
##   <chr>       <date>     <date>      <int>         <dbl>        <dbl>
## 1 USC00470273 1971-10-01 2021-12-31  18179         18355          176
## 2 USC00471416 1959-08-01 2021-12-31  21410         22799         1389
## 3 USC00474966 1884-01-01 1963-05-31  28906         29005           99
## 4 USW00014837 1939-10-01 2021-12-31  30043         30043            0


5.将3个温度指标(tmax,tmin,tavg)从华氏度换算成摄氏度,并替代原来的变量(C = 5*(F-32)/9, C为摄氏度,F为华氏度)

# Convert the three temperature columns from Fahrenheit to Celsius in place,
# using C = 5 * (F - 32) / 9. The columns keep their original names.
# NOTE: the result overwrites madision_weather, so running this statement a
# second time would apply the conversion twice.
madision_weather <- madision_weather %>%
  mutate(across(c(tmax, tmin, tavg), function(temp_f) 5 * (temp_f - 32) / 9))


6.①Madison地区2000年的最高气温和最低气温是多少,分别是哪一天哪一个气象站观测到的?②计算2000年每个气象站最高气温和最低气温的差值并排序

# ① 
# ① Day(s) and station(s) that observed the highest tmax or the lowest tmin
# in 2000. The `date` column was already added to madision_weather in
# question 4, so it is not recomputed here.
# The year filter must run first: max()/min() in the second filter are then
# evaluated over the 2000 observations only.
madision_weather %>%
  filter(year == 2000) %>%
  filter(
    tmax == max(tmax, na.rm = TRUE) |
      tmin == min(tmin, na.rm = TRUE)
  ) %>%
  select(date, station, tmax, tmin)
## # A tibble: 2 × 4
##   date       station      tmax  tmin
##   <date>     <chr>       <dbl> <dbl>
## 1 2000-09-02 USC00470273  33.9  20  
## 2 2000-12-25 USW00014837 -12.8 -29.4
# ② For each station, the spread between the highest tmax and the lowest tmin
# observed in 2000, listed from the largest spread to the smallest.
madision_weather %>%
  filter(year == 2000) %>%
  group_by(station) %>%
  summarise(
    max_tmax = max(tmax, na.rm = TRUE),
    min_tmin = min(tmin, na.rm = TRUE)
  ) %>%
  mutate(t_diff = max_tmax - min_tmin) %>%
  arrange(desc(t_diff))
## # A tibble: 3 × 4
##   station     max_tmax min_tmin t_diff
##   <chr>          <dbl>    <dbl>  <dbl>
## 1 USW00014837     33.3    -29.4   62.8
## 2 USC00470273     33.9    -28.3   62.2
## 3 USC00471416     32.8    -21.1   53.9


7.筛选出平均气温最高的10年,这10年中有多少年是2000年以后的?

# Average tavg per calendar year (all stations pooled), keep the 10 warmest
# years, then count how many of them are strictly later than 2000.
madision_weather %>%
  summarise(mean_tavg = mean(tavg, na.rm = TRUE), .by = year) %>%
  slice_max(order_by = mean_tavg, n = 10) %>%
  summarise(post_2000 = sum(year > 2000))
## # A tibble: 1 × 1
##   post_2000
##       <int>
## 1         4


8.为数据集添加日温差变量(命名:tdiff),“是否下雪”指示变量(命名:is_snow,提示:降雪量大于0)

# Add two derived columns to the data set:
#   tdiff   : daily temperature range (tmax - tmin)
#   is_snow : TRUE when snowfall is greater than 0; NA where snow is missing
madision_weather <- mutate(
  madision_weather,
  tdiff = tmax - tmin,
  is_snow = snow > 0
)


9.计算各气象站的平均气温(命名:station_tavg)和平均温差(命名:station_tdiff),并按气象站的海拔排序。

# Mean temperature and mean daily temperature range per station, ordered by
# station elevation. `elevation` is part of the grouping only so that it is
# carried into the result.
# `.groups = "drop"` returns an ungrouped tibble; without it summarise() keeps
# the result grouped by station and prints a message about the grouping.
madision_weather %>%
  group_by(station, elevation) %>%
  summarise(
    station_tavg = mean(tavg, na.rm = TRUE),
    station_tdiff = mean(tdiff, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(elevation)
## # A tibble: 4 × 4
##   station     elevation station_tavg station_tdiff
##   <chr>           <dbl>        <dbl>         <dbl>
## 1 USW00014837      262.         8.02         11.2 
## 2 USC00470273      265.         7.95         12.4 
## 3 USC00474966      297.         7.99          9.07
## 4 USC00471416      319.         7.98         10.6


10.以气象站“USC00470273”的数据代表Madison地区的天气,将数据聚合到年层面,并计算年降水量(prcp_year)、年降雪量(snow_year)、降雪天数(snow_days)、年均气温(tavg_year),保存数据并命名为madision_weather_panel。

# Yearly panel built from station USC00470273 only:
#   prcp_year : total precipitation in the year
#   snow_year : total snowfall in the year
#   snow_days : number of days flagged is_snow
#   tavg_year : mean of the daily average temperature
# Missing daily values are ignored in every aggregate.
# NOTE(review): the coverage table in question 4 shows this station starting
# on 1971-10-01, so the 1971 row presumably covers October-December only;
# consider excluding incomplete years before comparing years.
madision_weather_panel <- madision_weather %>%
  filter(station == "USC00470273") %>%
  group_by(year) %>%
  summarise(
    across(c(prcp, snow), ~ sum(.x, na.rm = TRUE), .names = "{.col}_year"),
    snow_days = sum(is_snow, na.rm = TRUE),
    tavg_year = mean(tavg, na.rm = TRUE)
  )


11.根据上题得到的数据,研究Madison的气温逐年变化趋势。

# Linear trend of the yearly mean temperature: the `year` coefficient is the
# estimated change in tavg_year per calendar year.
summary(lm(tavg_year ~ year, data = madision_weather_panel))
## 
## Call:
## lm(formula = tavg_year ~ year, data = madision_weather_panel)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.80575 -0.64707  0.01282  0.71645  1.94078 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -90.33559   20.91056  -4.320 7.59e-05 ***
## year          0.04921    0.01048   4.697 2.17e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.101 on 49 degrees of freedom
## Multiple R-squared:  0.3105, Adjusted R-squared:  0.2964 
## F-statistic: 22.06 on 1 and 49 DF,  p-value: 2.168e-05


12.这里定义了季节划分函数并添加季节列,请你计算每个季节的平均降水量、平均降雪量、平均气温,并从降水量、降雪量、气温几个指标中选择一个指标,绘制季节变化折线图

# Map month numbers to meteorological season names.
#
# month: vector of month numbers (1-12).
# Returns a character vector of the same length with "Spring" (3-5),
# "Summer" (6-8), "Fall" (9-11) or "Winter" (12, 1, 2). Missing or
# out-of-range months give NA instead of being labelled "Winter".
get_season <- function(month) {
  # Season name for each month, indexed by month number 1..12.
  season_by_month <- c(
    "Winter", "Winter",
    "Spring", "Spring", "Spring",
    "Summer", "Summer", "Summer",
    "Fall", "Fall", "Fall",
    "Winter"
  )
  # match() yields NA for anything that is not one of 1..12, and indexing
  # with NA propagates it to the result.
  season_by_month[match(month, 1:12)]
}

# Attach the season to every observation. The season is stored as a factor
# with levels in calendar order: a plain character column is sorted
# alphabetically (Fall, Spring, Summer, Winter) on a plot axis, which would
# make the seasonal line jump around instead of following the year.
madision_weather_season <- madision_weather %>%
  mutate(
    season = factor(
      get_season(month),
      levels = c("Spring", "Summer", "Fall", "Winter")
    )
  )

# Per-season means of the daily precipitation, snowfall and average
# temperature (all stations and years pooled; missing values ignored).
season_stats <- madision_weather_season %>%
  group_by(season) %>%
  summarise(
    mean_prcp = mean(prcp, na.rm = TRUE),
    mean_snow = mean(snow, na.rm = TRUE),
    mean_tavg = mean(tavg, na.rm = TRUE)
  )
season_stats

# Example plot: seasonal variation of the mean temperature.
# `group = 1` joins the four seasons with a single line; tavg was converted
# to Celsius in question 5.
ggplot(season_stats, aes(x = season, y = mean_tavg, group = 1)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Seasonal Temperature Variation",
    x = "Season",
    y = "Mean daily average temperature (°C)"
  )


13.根据本次作业出现的各种数据,提出一个你感兴趣的问题并进行简单研究。

# Exploratory question: does the mean daily temperature range (tdiff) of a
# station vary with its elevation?
# Collapse the data to one row per station, then regress mean_tdiff on
# elevation and print the model summary.
# NOTE(review): the regression has one point per station (4 in the output
# below), so it leaves only 2 residual degrees of freedom.
madision_weather %>%
  group_by(station) %>%
  summarise(
    # first() takes the elevation from the station's first row.
    elevation = first(elevation),
    mean_tdiff = mean(tdiff, na.rm = TRUE)
  ) %>%
  # `.` is the magrittr placeholder for the per-station summary table.
  lm(mean_tdiff ~ elevation, data = .) %>% summary()
## 
## Call:
## lm(formula = mean_tdiff ~ elevation, data = .)
## 
## Residuals:
##       1       2       3       4 
##  0.9087  0.8152 -1.4128 -0.3111 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.56647    8.14714   2.402    0.138
## elevation   -0.03060    0.02843  -1.077    0.394
## 
## Residual standard error: 1.338 on 2 degrees of freedom
## Multiple R-squared:  0.3669, Adjusted R-squared:  0.0503 
## F-statistic: 1.159 on 1 and 2 DF,  p-value: 0.3943