Distribution

#Lessons: Plotting histograms to see distributions. #Tests of normality when histograms are insufficient #Mutate to create a new column with the average of two other columns #Making prettier ggplots #Kolmogorov-Smirnov test #Q-Q plot

#Load data and packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytuesdayR)
# Download the TidyTuesday files for week 42 of 2025.
tuesdata <- tt_load(x = 2025, week = 42)

## ---- Compiling #TidyTuesday Information for 2025-10-21 ----
## --- There are 2 files available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 2: "historic_station_met.csv"
##   2 of 2: "station_meta.csv"

# Keep the historic_station_met table from the download as the working data.
climate <- tuesdata$historic_station_met

# Open the table in the data viewer for a quick visual check.
view(x = climate)

#Distribution of tmax

# Histogram of tmax with ggplot2's default binning.
climate |>
  ggplot(mapping = aes(x = tmax)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 928 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Distribution of tmin

# Histogram of tmin, split into 40 bins.
climate |>
  ggplot(mapping = aes(x = tmin)) +
  geom_histogram(bins = 40)

## Warning: Removed 902 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Distribution of rain

# Histogram of rain, split into 100 bins.
climate |>
  ggplot(mapping = aes(x = rain)) +
  geom_histogram(bins = 100)

## Warning: Removed 873 rows containing non-finite outside the scale range
## (`stat_bin()`).

#What a lovely right skew. The tail points to the right, so the skew is right.

#Can we make the graph prettier?

# Same rain histogram, restyled: coloured bars, a descriptive x-axis label,
# and the minimal theme. The stray trailing comma inside labs() is removed;
# ggplot2 happened to ignore the empty argument, but it was a typo.
ggplot(climate, aes(x = rain)) +
  geom_histogram(bins = 100, fill = "steelblue") +
  labs(x = "Rainfall in mm") +
  theme_minimal()

## Warning: Removed 873 rows containing non-finite outside the scale range
## (`stat_bin()`).

#To get a new column with temperature means

# Add tavg, the midpoint of tmax and tmin for each row
# (NA whenever either input is missing).
climate <- mutate(climate, tavg = (tmax + tmin) / 2)

# Histogram of the derived tavg column, split into 60 bins.
climate |>
  ggplot(mapping = aes(x = tavg)) +
  geom_histogram(bins = 60, fill = "steelblue")

## Warning: Removed 956 rows containing non-finite outside the scale range
## (`stat_bin()`).

#All temperature columns have a bimodal distribution. #Multiple tests are available to check which distribution fits the data best and whether it really is bimodal.

#What does the sunshine distribution look like?

# Histogram of sun, split into 100 bins.
climate |>
  ggplot(mapping = aes(x = sun)) +
  geom_histogram(bins = 100, fill = "steelblue")

## Warning: Removed 9168 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Yellow looked too ugly. #Not sure how close to a normal distribution this is.

# The Shapiro-Wilk test is generally more powerful and preferred for smaller
# sample sizes (under 50), while the Kolmogorov-Smirnov test is often
# recommended for larger samples (over 50) and is a more general
# goodness-of-fit test, so the latter is used here.
# Syntax: ks.test(data, "pnorm", mean = ..., sd = ...).
#
# The reference normal distribution must have the same mean and SD as the
# data. Comparing the raw values with N(0, 1) only asks "is this standard
# normal?", which is rejected for any variable not centred on 0 with unit
# spread, regardless of its shape. Alternatively, standardise the data first
# and then compare with mean = 0, sd = 1.
#
# Caveat: because the mean and SD are estimated from the same sample, the
# reported p-value is only approximate (the Lilliefors variant corrects this).
#
# NOTE(review): re-knit after this change; the printed test output further
# down was produced by the earlier comparison against N(0, 1).
ks_test <- ks.test(
  climate$rain,
  "pnorm",
  mean = mean(climate$rain, na.rm = TRUE),
  sd = sd(climate$rain, na.rm = TRUE)
)

## Warning in ks.test.default(climate$rain, "pnorm", mean = 0, sd = 1): ties
## should not be present for the one-sample Kolmogorov-Smirnov test

# Show the test statistic and p-value.
print(ks_test)

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  climate$rain
## D = 0.99622, p-value < 2.2e-16
## alternative hypothesis: two-sided

#The “ties should not be present for the one-sample Kolmogorov-Smirnov test” warning appears because the data contain duplicate values. A significant p-value means the null hypothesis (the data follow the reference normal distribution) is rejected in favour of the alternative hypothesis (there is a difference). The data on rain are not normally distributed.

#qqplot

# Normal Q-Q plot of rain: sample quantiles against theoretical normal
# quantiles, plus a blue reference line to make departures easier to see.
with(climate, {
  qqnorm(rain)
  qqline(rain, col = "blue")
})

#Definitely not normal.

Distribution

AlearnsR

2025-10-21