161.777 Practical Data Mining

# Add any other packages you need to load here.
library(tidyverse)

# Read in the data
uksun <- read_csv("https://www.massey.ac.nz/~jcmarsha/161777/assessment/data/uksun.csv")
woops <- read_csv("https://www.massey.ac.nz/~jcmarsha/161777/assessment/data/woops2025.csv")

Exercise 1: Exploratory analysis of UK weather records

1.1. Missing values

library(naniar)  #load the naniar package

## Warning: 程序包'naniar'是用R版本4.4.3 来建造的

uksun|> #read dataset
  gg_miss_upset() #visualize missing data

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

#sun.Durham and sun.Sheffield have missing data #sun.Durham has 159 points missing #sun.Sheffield has 1 point missing

1.2. Mean sunshine hours

library(Hmisc) #load the Hmisc package

## Warning: 程序包'Hmisc'是用R版本4.4.3 来建造的

## 
## 载入程序包：'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

uksun|>#read dataset
  mutate(sun.Sheffield=impute_mean(sun.Sheffield),sun.Durham=impute_mean(sun.Durham))|> #replace the missing values in sun.Durham and sun.Sheffield with the mean of the column
  summarise(mean_sun.Oxford=mean(sun.Oxford),
            mean_sun.Sheffield=mean(sun.Sheffield),
            mean_sun.Durham=mean(sun.Durham)) #calculate the mean of sun.Oxford, the imputed sun.Sheffield and sun.Durham columns.

## # A tibble: 1 × 3
##   mean_sun.Oxford mean_sun.Sheffield mean_sun.Durham
##             <dbl>              <dbl>           <dbl>
## 1            128.               111.            112.

#sun.Oxford has the highest mean sunshine hours #sun.Sheffield has the lowest mean sunshine hours

1.3. Mean rainfall per month

uksun_monthly_avg1 <- uksun|>  #given data from uksun to uksun_monthly_avg1
  select(-sun.Durham,-sun.Oxford,-sun.Sheffield) |> #removes the columns sun.Durham, sun.Oxford and sun.Sheffield from the dataset
  group_by(month)|> #group the dataset by the month column
  summarise(rain_Oxford=mean(rain.Oxford,na.rm=TRUE),
    rain_Durham=mean(rain.Durham, na.rm=TRUE),
    rain_Sheffield=mean(rain.Sheffield,na.rm=TRUE)) #calculate the mean of rainfall for each city, and ensure the missing values are ignored during calculation
uksun_monthly_avg_long1 <- uksun_monthly_avg1 |> #given data from uksun_monthly_avg1 to uksun_monthly_avg_long1
  pivot_longer(cols=starts_with("rain"), 
               names_to="city", 
               values_to="rain")|> #convert the wide-format data into long-format
  select(month, city, rain) #keep month, city and rain columns
print(uksun_monthly_avg1) #print the dataset

## # A tibble: 12 × 4
##    month rain_Oxford rain_Durham rain_Sheffield
##    <dbl>       <dbl>       <dbl>          <dbl>
##  1     1        57.6        56.9           82.0
##  2     2        41.9        45.8           63.8
##  3     3        44.7        42.8           58.5
##  4     4        45.8        46.0           58.5
##  5     5        54.9        48.9           59.2
##  6     6        52.1        53.0           61.9
##  7     7        52.6        59.3           62.5
##  8     8        59.7        67.0           66.3
##  9     9        56.1        57.6           62.5
## 10    10        62.7        58.6           73.9
## 11    11        66.1        67.8           84.1
## 12    12        62.4        58.1           81.7

1.4. Reproduce the graphic

rainfall<-uksun|> #give the data from uksun to rainfall
  select(year,month,rain.Durham,rain.Oxford,rain.Sheffield)|> #choose year,month,rain.Durham,rain.Oxford,rain.Sheffield columns
  pivot_longer(cols = starts_with("rain"), 
               names_to = "city", 
               values_to = "value")|> #convert the wide-format data into long-format
  mutate(month = factor(month, levels = 1:12, labels = month.abb)) #transform 1-12 into Jan-Dec
rainfall_plot<-rainfall|> #give the data from rainfall to rainfall_plot
  ggplot(aes(x = month, y = value, group = year))+ #setup basic ggplot
  geom_line(color = "black", alpha = 0.3)+ #add individual lines
  facet_wrap(~city,ncol=1,labeller = labeller(city = function(x) gsub("rain.", "", x)))+ #create multiple subplots facet by city
  geom_smooth(aes(group = city, color = city), method = "loess", se = FALSE, size = 1, span = 0.5)+ #add smoothed trend lines
  scale_color_manual(values = c("rain.Durham" = "steelblue", "rain.Oxford" = "brown", "rain.Sheffield" = "tan"))+ #choose the lines' colors
  scale_y_continuous(breaks = seq(0, 200, by = 100),
                     limits = c(0, 200)) + #adjust Y-axis scale
  labs(x = "", y = "Monthly rainfall") + #add axis labels
  theme_minimal() + #remove extra grid lines and background
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        strip.position = "top",
        legend.position = "none") #decide plot appearance

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

sunshine<-uksun|> #same like rainfall
  select(year,month,sun.Durham,sun.Oxford,sun.Sheffield)|>
  pivot_longer(cols = starts_with("sun"), 
               names_to = "city", 
               values_to = "value")|>
  mutate(month = factor(month, levels = 1:12, labels = month.abb))
sunshine_plot<-sunshine|>
  ggplot(aes(x = month, y = value, group = year))+
  geom_line(color = "black", alpha = 0.3)+
  facet_wrap(~city,ncol=1,labeller = labeller(city = function(x) gsub("rain.", "", x)))+
  geom_smooth(aes(group = city, color = city), method = "loess", se = FALSE, size = 1, span = 0.5)+
  scale_color_manual(values = c("sun.Durham" = "steelblue", "sun.Oxford" = "brown", "sun.Sheffield" = "tan"))+
  scale_y_continuous(breaks = seq(0, 300, by = 100),
                     limits = c(0, 300)) +
  labs(x = "", y = "Monthly sunshine") + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  
        strip.position = "top", 
        legend.position = "none" )

library(patchwork)

## Warning: 程序包'patchwork'是用R版本4.4.3 来建造的

rainfall_plot+sunshine_plot #combine rainfall_plot and sunshine_plot

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning in plot_theme(plot): The `strip.position` theme element is not defined
## in the element hierarchy.

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 163 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning in plot_theme(plot): The `strip.position` theme element is not defined
## in the element hierarchy.

## Warning: Removed 156 rows containing missing values or values outside the scale range
## (`geom_line()`).

Exercise 2: Imputation of UK weather records

2.1. Histogram of mean imputed July sunshine hours in Durham city

uksun|> #read dataset
  filter(month=="7")|> #select rows
  select(year,sun.Durham)|> #keep year and sun.Durham columns
  mutate(missing = is.na(sun.Durham),
         sun.Durham=impute_mean(sun.Durham))|> #create a new column missing to indicate whether sun.Durham is NA, and imputes missing values in sun.Durham using the mean
  ggplot(aes(x = year, y = sun.Durham, fill = missing)) + #setup basic ggplot  
  geom_col(color = "black", alpha = 0.7) + #creat a bar chart 
  labs(title = "Durham Sunshine in July", 
       x = "Year", 
       y = "Sunshine (hours)") + #set labs
  scale_fill_manual(values = c("TRUE" = "brown", "FALSE" = "steelblue")) +  #choose color
  theme_minimal() #remove extra grid lines and background

2.2. Standard deviation before and after mean imputation

sunshine_before<-uksun|> #give the data from uksun to sunshine_before
  filter(month=="7")|> #select rows
  select(year,sun.Durham)|> #keep year and sun.Durham columns
  filter(!is.na(sun.Durham)) #exclude rows where has missing (NA) values
sunshine_before_plot <- sunshine_before|> #give the data from sunshine_before to sunshine_before_plot
  ggplot(aes(y=sun.Durham))+ #setup basic ggplot
  geom_boxplot(fill = "steelblue",alpha=0.7)+ #create box-plot, and choose color
  labs(title = "Before Imputation", 
       y = "Sunshine Hours") +
  labs(subtitle="Durham Sunshine in July")+ #set labs
  theme_minimal() #remove extra grid lines and background

sunshine_after<-uksun|> #same like sunshine_before
  filter(month == "7")|>
  select(year, sun.Durham)|>
  mutate(sun.Durham=ifelse(is.na(sun.Durham),mean(sun.Durham, na.rm = TRUE), sun.Durham))
sunshine_after_plot <- sunshine_after|>
  ggplot(aes(y=sun.Durham))+
  geom_boxplot(fill = "brown",alpha=0.7)+
  labs(title = "After Imputation", 
       y = "Sunshine Hours") +
  labs(subtitle="Durham Sunshine in July")+
  theme_minimal()

library(patchwork)
sunshine_before_plot+sunshine_after_plot #combine sunshine_before_plot and sunshine_after_plot

#The dataset “Before Imputation” has a greater degree of dispersion. Judging from the interquartile range (IQR), the box length of “Before Imputation” is relatively longer, indicating that the middle 50% of the data in this dataset is more dispersed. In terms of outliers, in the “Before Imputation” boxplot, the outlier points are far from the box, suggesting that there are some extreme values deviating from the main body of the data. Although the “After Imputation” dataset also has outliers, the data as a whole is relatively more concentrated around the box.

2.3. k-Nearest neighbour imputation scatterplots

library(VIM)

## Warning: 程序包'VIM'是用R版本4.4.3 来建造的

## 载入需要的程序包：colorspace

## 载入需要的程序包：grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## 载入程序包：'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

uksun_k5<-uksun|>#give data from uksun to uksun_k5
  select(-year,-month)|> #remove year and month columns
  kNN(k=5) #calculate the mean of 5 the most similar datas to fill NA

uksun_k100<-uksun|>
  select(-year,-month)|>
  kNN(k=100)

# extract interpolated sunshine data for Durham and Oxford
K5<- data.frame(Durham_sunshine = uksun_k5$sun.Durham,
                Oxford_sunshine = uksun_k5$sun.Oxford,
                imputed = ifelse(is.na(uksun$sun.Durham), TRUE, FALSE))

K100<-data.frame(Durham_sunshine = uksun_k100$sun.Durham,
                 Oxford_sunshine = uksun_k100$sun.Oxford,
                 imputed = ifelse(is.na(uksun$sun.Durham), TRUE, FALSE))

ggplot(K5,aes(x=Oxford_sunshine,y=Durham_sunshine, color = imputed))+
  geom_point()+ #draw the scatter plot
  scale_color_manual(values=c("TRUE"="red","FALSE"= "black"))+ #the point imputed use color red, else use black
  labs(title="Durham Sunshine vs Oxford Sunshine (K=5)", 
       x="Oxford Sunshine",
       y="Durham Sunshine")+
  theme_minimal()

ggplot(K100,aes(x=Oxford_sunshine,y=Durham_sunshine,color= imputed))+
  geom_point()+
  scale_color_manual(values=c("TRUE"="red","FALSE"= "black"))+
  labs(title="Durham Sunshine vs Oxford Sunshine(K=100)", 
       x="Oxford Sunshine",
       y="Durham Sunshine") +
  theme_minimal()

#The imputed data points generally seem to follow the same trend as the real data. The Oxford sunshine hours increase, the Durham sunshine hours also increase for both imputed and real data. It is to be expected. The imputed values are estimated based on the values of the nearby real data points. So, it is reasonable that the imputed data would follow the overall trend of the real data.

Exercise 3: Woops!

library(skimr)

## Warning: 程序包'skimr'是用R版本4.4.3 来建造的

## 
## 载入程序包：'skimr'

## The following object is masked from 'package:naniar':
## 
##     n_complete

skim(woops) #makea summary of woops by summarizing key statistics

Data summary
Name	woops
Number of rows	1024
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	512.50	295.75	1.00	256.75	512.50	768.25	1024.00	▇▇▇▇▇
gene1	1	0.00	1.00	-3.34	-0.69	0.00	0.65	3.44	▁▅▇▃▁
gene2	1	0.00	0.97	-2.86	-0.68	0.00	0.65	2.96	▁▅▇▃▁
gene3	1	-0.04	0.71	-1.98	-0.53	-0.04	0.44	2.10	▁▅▇▃▁

woops|>
  pivot_longer(cols=starts_with("gene"),names_to="Gene",values_to="value")|> #convert the wide-format data into long-format
  ggplot(aes(x=id,y=value,color=Gene))+
  geom_point()+ #draw the scatter plot
  theme_minimal()

woops|>
  pivot_longer(cols=starts_with("gene"),names_to="Gene",values_to="value")|>
  ggplot(aes(x=id,y=value,color=Gene))+
  geom_boxplot()+ #draw the box-plot
  theme_minimal()

#calculate the Q1, Q3, IQR and range of outliners
find_outliers <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR)
}

woops <- woops %>%
  mutate(gene1_outlier = find_outliers(gene1),
         gene2_outlier = find_outliers(gene2),
         gene3_outlier = find_outliers(gene3)) #mark the outliers

woops_outliers <- woops %>%
  filter(gene1_outlier | gene2_outlier | gene3_outlier) #select the line with outliner

print(woops_outliers)  #print all lines with outliners

## # A tibble: 17 × 7
##       id  gene1   gene2  gene3 gene1_outlier gene2_outlier gene3_outlier
##    <dbl>  <dbl>   <dbl>  <dbl> <lgl>         <lgl>         <lgl>        
##  1    46 -1.39  -2.86    2.06  FALSE         TRUE          TRUE         
##  2    73  3.38   0.0717 -1.75  TRUE          FALSE         FALSE        
##  3   225  3.29   0.750  -1.98  TRUE          FALSE         FALSE        
##  4   241  3.04  -0.940  -0.999 TRUE          FALSE         FALSE        
##  5   312 -3.34  -0.548   1.92  TRUE          FALSE         TRUE         
##  6   416  0.267 -2.68    1.21  FALSE         TRUE          FALSE        
##  7   433 -2.69  -1.20    2.00  TRUE          FALSE         TRUE         
##  8   464 -2.72   0.0782  1.37  TRUE          FALSE         FALSE        
##  9   466 -2.05  -1.79    1.93  FALSE         FALSE         TRUE         
## 10   567  0.236  2.65   -1.46  FALSE         TRUE          FALSE        
## 11   593 -1.61  -2.58    2.10  FALSE         FALSE         TRUE         
## 12   687  3.15  -0.0738 -1.58  TRUE          FALSE         FALSE        
## 13   699  3.44  -1.11   -1.15  TRUE          FALSE         FALSE        
## 14   723 -3.28   0.729   1.27  TRUE          FALSE         FALSE        
## 15   766 -2.84   0.0531  1.36  TRUE          FALSE         FALSE        
## 16   814 -0.121  2.96   -1.44  FALSE         TRUE          FALSE        
## 17   946 -2.77   0.159   1.27  TRUE          FALSE         FALSE

library(plotly)

## Warning: 程序包'plotly'是用R版本4.4.3 来建造的

## 
## 载入程序包：'plotly'

## The following object is masked from 'package:Hmisc':
## 
##     subplot

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(dplyr)
library(MASS)

## 
## 载入程序包：'MASS'

## The following object is masked from 'package:plotly':
## 
##     select

## The following object is masked from 'package:patchwork':
## 
##     area

## The following object is masked from 'package:dplyr':
## 
##     select

gene_data <- woops[, c("gene1", "gene2", "gene3")]

#calculate the parameter used to calculate Mahalanobis distance
cov_matrix <- cov(gene_data)
inv_cov <- ginv(cov_matrix)

#calculate Mahalanobis distance
mahal_dist <- apply(gene_data, 1, function(row) {
  diff <- row - colMeans(gene_data)
  sqrt(t(diff) %*% inv_cov %*% diff)
})
outlier_row <- which.max(mahal_dist)#choose the line with the most Mahalanobis distance
print(woops[outlier_row, ]) #print that outlier row

## # A tibble: 1 × 7
##      id gene1  gene2 gene3 gene1_outlier gene2_outlier gene3_outlier
##   <dbl> <dbl>  <dbl> <dbl> <lgl>         <lgl>         <lgl>        
## 1   347  2.36 -0.636 0.243 FALSE         FALSE         FALSE

#After I do the first {r}, I found I cannot find the only row with outlier. Reading the outliner, I find several row which was contaminated. And I want to find it. Maybe it’s because there are three parameter, perhaps 3D scatter plot can easy to find it. So I ask online, and study how to do that. Even though I don’t write the code on the ass1, I already know a new way to draw plot. In this period, I also learn Mahalanobis distance, which is a useful concept. Finally I get the answer by using Mahalanobis distance. But I still want to know how to do that using the knowledge. Would you please explain it in future classes? I’m grateful whatever yes or no, thank you for read this.

Exercise 4: Reflection

WRITE DOWN WHICH SOURCES OF INFORMATION YOU USED, AND REFLECT ON HOW IT WAS USED #First, I absolutely use the notes from the classes. And I also ask my classmate exercise 3. Like how to choose the data be contaminated. Unfortunately, she do the same way and choose the outliner by observing. So I ask on the Internet. And I also write the notes on my notebook. What’s more, I choose geom_col in 2.1. I know it ask us to produce a histogram, but I think geom_col can do that, and it was taught in class. Furthermore, I had a lot of tedious steps in many places. So I had to do some steps to make it seems like the example chart. Like patchwork.