# Add any other packages you need to load here.
library(tidyverse)
# Read in the data
uksun <- read_csv("https://www.massey.ac.nz/~jcmarsha/161777/assessment/data/uksun.csv")
woops <- read_csv("https://www.massey.ac.nz/~jcmarsha/161777/assessment/data/woops2025.csv")
library(naniar) #load the naniar package
## Warning: 程序包'naniar'是用R版本4.4.3 来建造的
uksun|> #read dataset
gg_miss_upset() #visualize missing data
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
#sun.Durham and sun.Sheffield have missing data #sun.Durham has 159
points missing #sun.Sheffield has 1 point missing
library(Hmisc) #load the Hmisc package
## Warning: 程序包'Hmisc'是用R版本4.4.3 来建造的
##
## 载入程序包:'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
uksun|>#read dataset
mutate(sun.Sheffield=impute_mean(sun.Sheffield),sun.Durham=impute_mean(sun.Durham))|> #replace the missing values in sun.Durham and sun.Sheffield with the mean of the column
summarise(mean_sun.Oxford=mean(sun.Oxford),
mean_sun.Sheffield=mean(sun.Sheffield),
mean_sun.Durham=mean(sun.Durham)) #calculate the mean of sun.Oxford, the imputed sun.Sheffield and sun.Durham columns.
## # A tibble: 1 × 3
## mean_sun.Oxford mean_sun.Sheffield mean_sun.Durham
## <dbl> <dbl> <dbl>
## 1 128. 111. 112.
#sun.Oxford has the highest mean sunshine hours #sun.Sheffield has the lowest mean sunshine hours
uksun_monthly_avg1 <- uksun|> #given data from uksun to uksun_monthly_avg1
select(-sun.Durham,-sun.Oxford,-sun.Sheffield) |> #removes the columns sun.Durham, sun.Oxford and sun.Sheffield from the dataset
group_by(month)|> #group the dataset by the month column
summarise(rain_Oxford=mean(rain.Oxford,na.rm=TRUE),
rain_Durham=mean(rain.Durham, na.rm=TRUE),
rain_Sheffield=mean(rain.Sheffield,na.rm=TRUE)) #calculate the mean of rainfall for each city, and ensure the missing values are ignored during calculation
uksun_monthly_avg_long1 <- uksun_monthly_avg1 |> #given data from uksun_monthly_avg1 to uksun_monthly_avg_long1
pivot_longer(cols=starts_with("rain"),
names_to="city",
values_to="rain")|> #convert the wide-format data into long-format
select(month, city, rain) #keep month, city and rain columns
print(uksun_monthly_avg1) #print the dataset
## # A tibble: 12 × 4
## month rain_Oxford rain_Durham rain_Sheffield
## <dbl> <dbl> <dbl> <dbl>
## 1 1 57.6 56.9 82.0
## 2 2 41.9 45.8 63.8
## 3 3 44.7 42.8 58.5
## 4 4 45.8 46.0 58.5
## 5 5 54.9 48.9 59.2
## 6 6 52.1 53.0 61.9
## 7 7 52.6 59.3 62.5
## 8 8 59.7 67.0 66.3
## 9 9 56.1 57.6 62.5
## 10 10 62.7 58.6 73.9
## 11 11 66.1 67.8 84.1
## 12 12 62.4 58.1 81.7
rainfall<-uksun|> #give the data from uksun to rainfall
select(year,month,rain.Durham,rain.Oxford,rain.Sheffield)|> #choose year,month,rain.Durham,rain.Oxford,rain.Sheffield columns
pivot_longer(cols = starts_with("rain"),
names_to = "city",
values_to = "value")|> #convert the wide-format data into long-format
mutate(month = factor(month, levels = 1:12, labels = month.abb)) #transform 1-12 into Jan-Dec
rainfall_plot<-rainfall|> #give the data from rainfall to rainfall_plot
ggplot(aes(x = month, y = value, group = year))+ #setup basic ggplot
geom_line(color = "black", alpha = 0.3)+ #add individual lines
facet_wrap(~city,ncol=1,labeller = labeller(city = function(x) gsub("rain.", "", x)))+ #create multiple subplots facet by city
geom_smooth(aes(group = city, color = city), method = "loess", se = FALSE, size = 1, span = 0.5)+ #add smoothed trend lines
scale_color_manual(values = c("rain.Durham" = "steelblue", "rain.Oxford" = "brown", "rain.Sheffield" = "tan"))+ #choose the lines' colors
scale_y_continuous(breaks = seq(0, 200, by = 100),
limits = c(0, 200)) + #adjust Y-axis scale
labs(x = "", y = "Monthly rainfall") + #add axis labels
theme_minimal() + #remove extra grid lines and background
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.position = "top",
legend.position = "none") #decide plot appearance
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
sunshine<-uksun|> #same like rainfall
select(year,month,sun.Durham,sun.Oxford,sun.Sheffield)|>
pivot_longer(cols = starts_with("sun"),
names_to = "city",
values_to = "value")|>
mutate(month = factor(month, levels = 1:12, labels = month.abb))
sunshine_plot<-sunshine|>
ggplot(aes(x = month, y = value, group = year))+
geom_line(color = "black", alpha = 0.3)+
facet_wrap(~city,ncol=1,labeller = labeller(city = function(x) gsub("rain.", "", x)))+
geom_smooth(aes(group = city, color = city), method = "loess", se = FALSE, size = 1, span = 0.5)+
scale_color_manual(values = c("sun.Durham" = "steelblue", "sun.Oxford" = "brown", "sun.Sheffield" = "tan"))+
scale_y_continuous(breaks = seq(0, 300, by = 100),
limits = c(0, 300)) +
labs(x = "", y = "Monthly sunshine") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.position = "top",
legend.position = "none" )
library(patchwork)
## Warning: 程序包'patchwork'是用R版本4.4.3 来建造的
rainfall_plot+sunshine_plot #combine rainfall_plot and sunshine_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning in plot_theme(plot): The `strip.position` theme element is not defined
## in the element hierarchy.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 163 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning in plot_theme(plot): The `strip.position` theme element is not defined
## in the element hierarchy.
## Warning: Removed 156 rows containing missing values or values outside the scale range
## (`geom_line()`).
uksun|> #read dataset
filter(month=="7")|> #select rows
select(year,sun.Durham)|> #keep year and sun.Durham columns
mutate(missing = is.na(sun.Durham),
sun.Durham=impute_mean(sun.Durham))|> #create a new column missing to indicate whether sun.Durham is NA, and imputes missing values in sun.Durham using the mean
ggplot(aes(x = year, y = sun.Durham, fill = missing)) + #setup basic ggplot
geom_col(color = "black", alpha = 0.7) + #creat a bar chart
labs(title = "Durham Sunshine in July",
x = "Year",
y = "Sunshine (hours)") + #set labs
scale_fill_manual(values = c("TRUE" = "brown", "FALSE" = "steelblue")) + #choose color
theme_minimal() #remove extra grid lines and background
sunshine_before<-uksun|> #give the data from uksun to sunshine_before
filter(month=="7")|> #select rows
select(year,sun.Durham)|> #keep year and sun.Durham columns
filter(!is.na(sun.Durham)) #exclude rows where has missing (NA) values
sunshine_before_plot <- sunshine_before|> #give the data from sunshine_before to sunshine_before_plot
ggplot(aes(y=sun.Durham))+ #setup basic ggplot
geom_boxplot(fill = "steelblue",alpha=0.7)+ #create box-plot, and choose color
labs(title = "Before Imputation",
y = "Sunshine Hours") +
labs(subtitle="Durham Sunshine in July")+ #set labs
theme_minimal() #remove extra grid lines and background
sunshine_after<-uksun|> #same like sunshine_before
filter(month == "7")|>
select(year, sun.Durham)|>
mutate(sun.Durham=ifelse(is.na(sun.Durham),mean(sun.Durham, na.rm = TRUE), sun.Durham))
sunshine_after_plot <- sunshine_after|>
ggplot(aes(y=sun.Durham))+
geom_boxplot(fill = "brown",alpha=0.7)+
labs(title = "After Imputation",
y = "Sunshine Hours") +
labs(subtitle="Durham Sunshine in July")+
theme_minimal()
library(patchwork)
sunshine_before_plot+sunshine_after_plot #combine sunshine_before_plot and sunshine_after_plot
#The dataset “Before Imputation” has a greater degree of dispersion.
Judging from the interquartile range (IQR), the box length of “Before
Imputation” is relatively longer, indicating that the middle 50% of the
data in this dataset is more dispersed. In terms of outliers, in the
“Before Imputation” boxplot, the outlier points are far from the box,
suggesting that there are some extreme values deviating from the main
body of the data. Although the “After Imputation” dataset also has
outliers, the data as a whole is relatively more concentrated around the
box.
library(VIM)
## Warning: 程序包'VIM'是用R版本4.4.3 来建造的
## 载入需要的程序包:colorspace
## 载入需要的程序包:grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## 载入程序包:'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
uksun_k5<-uksun|>#give data from uksun to uksun_k5
select(-year,-month)|> #remove year and month columns
kNN(k=5) #calculate the mean of 5 the most similar datas to fill NA
uksun_k100<-uksun|>
select(-year,-month)|>
kNN(k=100)
# extract interpolated sunshine data for Durham and Oxford
K5<- data.frame(Durham_sunshine = uksun_k5$sun.Durham,
Oxford_sunshine = uksun_k5$sun.Oxford,
imputed = ifelse(is.na(uksun$sun.Durham), TRUE, FALSE))
K100<-data.frame(Durham_sunshine = uksun_k100$sun.Durham,
Oxford_sunshine = uksun_k100$sun.Oxford,
imputed = ifelse(is.na(uksun$sun.Durham), TRUE, FALSE))
ggplot(K5,aes(x=Oxford_sunshine,y=Durham_sunshine, color = imputed))+
geom_point()+ #draw the scatter plot
scale_color_manual(values=c("TRUE"="red","FALSE"= "black"))+ #the point imputed use color red, else use black
labs(title="Durham Sunshine vs Oxford Sunshine (K=5)",
x="Oxford Sunshine",
y="Durham Sunshine")+
theme_minimal()
ggplot(K100,aes(x=Oxford_sunshine,y=Durham_sunshine,color= imputed))+
geom_point()+
scale_color_manual(values=c("TRUE"="red","FALSE"= "black"))+
labs(title="Durham Sunshine vs Oxford Sunshine(K=100)",
x="Oxford Sunshine",
y="Durham Sunshine") +
theme_minimal()
#The imputed data points generally seem to follow the same trend as the real data. The Oxford sunshine hours increase, the Durham sunshine hours also increase for both imputed and real data. It is to be expected. The imputed values are estimated based on the values of the nearby real data points. So, it is reasonable that the imputed data would follow the overall trend of the real data.
library(skimr)
## Warning: 程序包'skimr'是用R版本4.4.3 来建造的
##
## 载入程序包:'skimr'
## The following object is masked from 'package:naniar':
##
## n_complete
skim(woops) #makea summary of woops by summarizing key statistics
Name | woops |
Number of rows | 1024 |
Number of columns | 4 |
_______________________ | |
Column type frequency: | |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 512.50 | 295.75 | 1.00 | 256.75 | 512.50 | 768.25 | 1024.00 | ▇▇▇▇▇ |
gene1 | 0 | 1 | 0.00 | 1.00 | -3.34 | -0.69 | 0.00 | 0.65 | 3.44 | ▁▅▇▃▁ |
gene2 | 0 | 1 | 0.00 | 0.97 | -2.86 | -0.68 | 0.00 | 0.65 | 2.96 | ▁▅▇▃▁ |
gene3 | 0 | 1 | -0.04 | 0.71 | -1.98 | -0.53 | -0.04 | 0.44 | 2.10 | ▁▅▇▃▁ |
woops|>
pivot_longer(cols=starts_with("gene"),names_to="Gene",values_to="value")|> #convert the wide-format data into long-format
ggplot(aes(x=id,y=value,color=Gene))+
geom_point()+ #draw the scatter plot
theme_minimal()
woops|>
pivot_longer(cols=starts_with("gene"),names_to="Gene",values_to="value")|>
ggplot(aes(x=id,y=value,color=Gene))+
geom_boxplot()+ #draw the box-plot
theme_minimal()
#calculate the Q1, Q3, IQR and range of outliners
find_outliers <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR)
}
woops <- woops %>%
mutate(gene1_outlier = find_outliers(gene1),
gene2_outlier = find_outliers(gene2),
gene3_outlier = find_outliers(gene3)) #mark the outliers
woops_outliers <- woops %>%
filter(gene1_outlier | gene2_outlier | gene3_outlier) #select the line with outliner
print(woops_outliers) #print all lines with outliners
## # A tibble: 17 × 7
## id gene1 gene2 gene3 gene1_outlier gene2_outlier gene3_outlier
## <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl>
## 1 46 -1.39 -2.86 2.06 FALSE TRUE TRUE
## 2 73 3.38 0.0717 -1.75 TRUE FALSE FALSE
## 3 225 3.29 0.750 -1.98 TRUE FALSE FALSE
## 4 241 3.04 -0.940 -0.999 TRUE FALSE FALSE
## 5 312 -3.34 -0.548 1.92 TRUE FALSE TRUE
## 6 416 0.267 -2.68 1.21 FALSE TRUE FALSE
## 7 433 -2.69 -1.20 2.00 TRUE FALSE TRUE
## 8 464 -2.72 0.0782 1.37 TRUE FALSE FALSE
## 9 466 -2.05 -1.79 1.93 FALSE FALSE TRUE
## 10 567 0.236 2.65 -1.46 FALSE TRUE FALSE
## 11 593 -1.61 -2.58 2.10 FALSE FALSE TRUE
## 12 687 3.15 -0.0738 -1.58 TRUE FALSE FALSE
## 13 699 3.44 -1.11 -1.15 TRUE FALSE FALSE
## 14 723 -3.28 0.729 1.27 TRUE FALSE FALSE
## 15 766 -2.84 0.0531 1.36 TRUE FALSE FALSE
## 16 814 -0.121 2.96 -1.44 FALSE TRUE FALSE
## 17 946 -2.77 0.159 1.27 TRUE FALSE FALSE
library(plotly)
## Warning: 程序包'plotly'是用R版本4.4.3 来建造的
##
## 载入程序包:'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(MASS)
##
## 载入程序包:'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
gene_data <- woops[, c("gene1", "gene2", "gene3")]
#calculate the parameter used to calculate Mahalanobis distance
cov_matrix <- cov(gene_data)
inv_cov <- ginv(cov_matrix)
#calculate Mahalanobis distance
mahal_dist <- apply(gene_data, 1, function(row) {
diff <- row - colMeans(gene_data)
sqrt(t(diff) %*% inv_cov %*% diff)
})
outlier_row <- which.max(mahal_dist)#choose the line with the most Mahalanobis distance
print(woops[outlier_row, ]) #print that outlier row
## # A tibble: 1 × 7
## id gene1 gene2 gene3 gene1_outlier gene2_outlier gene3_outlier
## <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl>
## 1 347 2.36 -0.636 0.243 FALSE FALSE FALSE
#After I do the first {r}, I found I cannot find the only row with outlier. Reading the outliner, I find several row which was contaminated. And I want to find it. Maybe it’s because there are three parameter, perhaps 3D scatter plot can easy to find it. So I ask online, and study how to do that. Even though I don’t write the code on the ass1, I already know a new way to draw plot. In this period, I also learn Mahalanobis distance, which is a useful concept. Finally I get the answer by using Mahalanobis distance. But I still want to know how to do that using the knowledge. Would you please explain it in future classes? I’m grateful whatever yes or no, thank you for read this.
WRITE DOWN WHICH SOURCES OF INFORMATION YOU USED, AND REFLECT ON HOW IT WAS USED #First, I absolutely use the notes from the classes. And I also ask my classmate exercise 3. Like how to choose the data be contaminated. Unfortunately, she do the same way and choose the outliner by observing. So I ask on the Internet. And I also write the notes on my notebook. What’s more, I choose geom_col in 2.1. I know it ask us to produce a histogram, but I think geom_col can do that, and it was taught in class. Furthermore, I had a lot of tedious steps in many places. So I had to do some steps to make it seems like the example chart. Like patchwork.