#Q1泊松分布与地震次数 #(a)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggrepel)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(dplyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
alldata <- read_xlsx("earthquake_2015_2024.xlsx")
colnames(alldata) <- c("id", "time", "lon", "lat", "center_depth", "level", "location","type")
alldata$level = as.numeric(alldata$level)
data = alldata[alldata$level >= 4 & alldata$id <= 1066,]
monthly_counts <- data %>%
  mutate(
    Year = year(time),
    Month = month(time)
  )

tempfile <- monthly_counts %>% 
  group_by(Month) %>%
  summarise(
    count = n(),
  )
plot(tempfile$Month, tempfile$count, type="l")

#(b)

library(readxl)
library(dplyr)
alldata <- read_xlsx("earthquake_2015_2024.xlsx")
colnames(alldata) <- c("id", "time", "lon", "lat", "center_depth", "level", "location","type")
alldata$level = as.numeric(alldata$level)
data = alldata[alldata$level >= 4,]
monthly_counts <- data %>%
  mutate(
    Year = year(time),
    Month = month(time)
  )

tempfile <- monthly_counts %>% 
  group_by(Year ,Month) %>%
  summarise(
    frequency = n(),
  )
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
tempfile1 <- tempfile %>%
  group_by(frequency) %>%
  summarise(
    count = n(),
  )
total <- sum(tempfile1$count)
probability <- tempfile1$count / total
Expected_value <- sum(tempfile1$frequency * probability)
cat("估计λ的值为:",Expected_value)
## 估计λ的值为: 14.525

#(c)

lambda <- 14.525
p1 <- ppois(0, lambda, lower.tail = TRUE, log.p = FALSE)
p2 <- ppois(4, lambda, lower.tail = TRUE, log.p = FALSE)
p3 <- ppois(20, lambda, lower.tail = FALSE, log.p = FALSE)
cat("根据泊松分布近似,一个月内没有发生震级超过四级的地震的概率为:",p1,"一个月发生小于5次震级超过四级的地震的概率为:",p2,"一个月发生超过20次震级超过四级的地震的概率为:",p3)
## 根据泊松分布近似,一个月内没有发生震级超过四级的地震的概率为: 4.918953e-07 一个月发生小于5次震级超过四级的地震的概率为: 0.00122303 一个月发生超过20次震级超过四级的地震的概率为: 0.0646632

#Q2正态概率与分位数 #前置运算

library(HistData)
library(dplyr)
data(Galton)
Galton$child = as.numeric(Galton$child)
Data1 <- Galton %>%
  group_by(child) %>%
  summarise(
    count = n()
  )

#(a)

totalnumber <- sum(Data1$count)
totalheight <- sum(Data1$child * Data1$count)
average <- totalheight/totalnumber
probability1 <- Data1$count/totalnumber
variance <- sum(probability1 * (Data1$child - average) * (Data1$child - average))
sd1 <- sqrt(variance)
cat("样本均值为:",average,"方差为",variance,"拟合的正态分布为X~N(",average,",",variance,")" )
## 样本均值为: 68.08847 方差为 6.333197 拟合的正态分布为X~N( 68.08847 , 6.333197 )

#(b)

hist(Galton$child,
     main = "身高的频率分布直方图",
     xlab = "身高",
     ylab = "频率",
     col = "lightblue",
     border = "black")

curve(dnorm(x,mean = 68.09,sd = sd1),from = 60,to = 76,
      main = "身高的正态分布密度曲线",
      xlab = "身高",
      ylab = "密度"
      )

#(c)

a <- 66
b <- 71
c <- 65
d <- 72
e <- 74
p1 <- pnorm(b, mean = average, sd = sd1) - pnorm(a, mean = average, sd = sd1)
p2 <- pnorm(d, mean = average, sd = sd1) - pnorm(c, mean = average, sd = sd1)
p3 <- 1 - pnorm(e, mean = average, sd = sd1)
cat("介于区间[66,71]的概率为",p1,"介于区间[65,72]的概率为",p2,"身高大于74inch的概率为",p3)
## 介于区间[66,71]的概率为 0.6730484 介于区间[65,72]的概率为 0.8300788 身高大于74inch的概率为 0.009411215

#(d)

probs <- seq(0.05,0.95,by = 0.05)
expected1 <- qnorm(probs, mean = average, sd = sd1, lower.tail = FALSE, log.p = FALSE)
expected2 <- qnorm(probs, mean = average, sd = sd1, lower.tail = TRUE, log.p = FALSE)

#(e)

probs <- seq(0.05,0.95,by = 0.05)
experience <- quantile(Galton$child, probs)
plot(probs,experience,type = "l",col = "blue",lwd = 2,
      xlab = "概率", ylab = "分位数值",
     main = "正态分位数与经验分位数比较",
     ylim = range(c(experience, expected2)))
lines(probs, expected2, col = "red", lwd = 2, lty = 2)
legend("topleft", 
       legend = c("经验分位数", "正态分位数"),
       col = c("blue", "red"), 
       lwd = 2, lty = c(1, 2),
       bty = "n")