#Lessons: Plotting histograms to see distributions. #Tests of normality when histograms are insufficient #Mutate to create a new column with the average of two other columns #Making prettier ggplots #Kolmogorov-Smirnov test #Q-Q plot
#Load data and packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytuesdayR)
# Fetch the TidyTuesday release for 2025, week 42 (downloads the files).
tuesdata <-
  tidytuesdayR::tt_load(2025, week = 42)
## ---- Compiling #TidyTuesday Information for 2025-10-21 ----
## --- There are 2 files available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 2: "historic_station_met.csv"
## 2 of 2: "station_meta.csv"
# Work with the station measurements table from here on.
climate <-
  tuesdata$historic_station_met
# Open the table in the data viewer for a quick look.
climate |> view()
# Distribution of tmax (default binning)
climate |>
  ggplot(aes(x = tmax)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 928 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Distribution of tmin, using 40 bins
climate |>
  ggplot(aes(x = tmin)) +
  geom_histogram(bins = 40)
## Warning: Removed 902 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Distribution of rain, using 100 bins
climate |>
  ggplot(aes(x = rain)) +
  geom_histogram(bins = 100)
## Warning: Removed 873 rows containing non-finite outside the scale range
## (`stat_bin()`).
# A lovely right skew: the tail stretches to the right, hence "right" skew.
# Can we make the graph prettier?
# Same rainfall histogram as before, with a fill colour, a readable x-axis
# label and a lighter theme. The stray trailing comma inside labs() (which
# passed an empty argument) has been removed.
ggplot(climate, aes(x = rain)) +
  geom_histogram(bins = 100, fill = "steelblue") +
  labs(x = "Rainfall in mm") +
  theme_minimal()
## Warning: Removed 873 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Add a column holding the mean of tmax and tmin, then plot its distribution.
climate <- mutate(climate, tavg = (tmax + tmin) / 2)
climate |>
  ggplot(aes(x = tavg)) +
  geom_histogram(bins = 60, fill = "steelblue")
## Warning: Removed 956 rows containing non-finite outside the scale range
## (`stat_bin()`).
#All temp columns have a bimodal distribution. #Multiple tests are
#available to see which distribution fits the data best, and whether it is
#really bimodal.
# What does the sunshine distribution look like?
climate |>
  ggplot(aes(x = sun)) +
  geom_histogram(bins = 100, fill = "steelblue")
## Warning: Removed 9168 rows containing non-finite outside the scale range
## (`stat_bin()`).
#Yellow looked too ugly. #Don't know how good a normal distribution this
#is.
# Kolmogorov-Smirnov normality check for rainfall ----
# The Shapiro-Wilk test is generally more powerful and preferred for smaller
# sample sizes (under 50), while the Kolmogorov-Smirnov test is often
# recommended for larger samples (over 50) and is a more general
# goodness-of-fit test, so the latter is used here.
# Syntax: ks.test(data, "pnorm", mean = ..., sd = ...).
#
# The reference normal must be on the same scale as the data. Comparing raw
# rainfall against the standard normal (mean = 0, sd = 1) only shows that the
# values are not centred on 0 with unit spread; it says nothing about the
# shape of the distribution. The comparison therefore uses a normal with the
# sample mean and standard deviation (NAs excluded; ks.test() drops NAs from
# the data itself).
#
# Caveat: the ks.test() documentation requires the reference parameters to be
# pre-specified rather than estimated from the data, so the p-value here is
# only approximate. Read it together with the Q-Q plot.
ks_test <- ks.test(
  climate$rain,
  "pnorm",
  mean = mean(climate$rain, na.rm = TRUE),
  sd = sd(climate$rain, na.rm = TRUE)
)
ks_test
# NOTE: re-run to record the output of this call. The output previously
# recorded here (D = 0.99622, p-value < 2.2e-16) came from the comparison
# against N(0, 1) and no longer applies.
#
# A "ties should not be present for the one-sample Kolmogorov-Smirnov test"
# warning means the data contain duplicate values.
# A significant p-value suggests rejecting the null hypothesis that the data
# follow the reference normal distribution (alternative hypothesis: there is a
# difference), i.e. the rain data are not normally distributed.
# Q-Q plot ----
# Draw the normal Q-Q plot for rainfall.
climate$rain |> qqnorm()
# Overlay a blue reference line to make departures from normality easy to see.
climate$rain |> qqline(col = "blue")
# Definitely not normal.