library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych) # Load the psych library
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
# Import the London pupil-language workbook
library(readxl)
dataLondon <- read_excel(
  path = "~/Library/Mobile Documents/com~apple~CloudDocs/UNI/courses/year two/quan/QRMSEMINARS/assignment 1/dataLondon.xlsx"
)
# View(dataLondon)
# Keep the raw import untouched and work on a copy named for the assignment
dataAssignment <- dataLondon
# Inspect the column names and types of the working copy
str(dataAssignment)
## tibble [1,000 × 7] (S3: tbl_df/tbl/data.frame)
##  $ year            : num [1:1000] 202425 202425 202425 202425 202425 ...
##  $ country         : chr [1:1000] "England" "England" "England" "England" ...
##  $ region          : chr [1:1000] "London" "London" "London" "London" ...
##  $ localAuthority  : chr [1:1000] "City of London" "City of London" "Camden" "Camden" ...
##  $ schoolType      : chr [1:1000] "State-funded primary" "State-funded primary" "State-funded nursery" "State-funded nursery" ...
##  $ language        : chr [1:1000] "English" "EAL" "English" "EAL" ...
##  $ percentagePupils: num [1:1000] 39.8 60.2 85.3 14.7 47.5 ...
# Convert every categorical column to a factor in a single pass
dataAssignment <- dataAssignment %>%
  mutate(
    across(
      c(year, country, region, localAuthority, schoolType, language),
      as.factor
    )
  )
# Re-inspect the structure to confirm the conversions
str(dataAssignment)
## tibble [1,000 × 7] (S3: tbl_df/tbl/data.frame)
##  $ year            : Factor w/ 4 levels "202122","202223",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ country         : Factor w/ 1 level "England": 1 1 1 1 1 1 1 1 1 1 ...
##  $ region          : Factor w/ 1 level "London": 1 1 1 1 1 1 1 1 1 1 ...
##  $ localAuthority  : Factor w/ 33 levels "Barking and Dagenham",..: 7 7 6 6 6 6 6 6 6 6 ...
##  $ schoolType      : Factor w/ 4 levels "State-funded nursery",..: 2 2 1 1 2 2 3 3 4 4 ...
##  $ language        : Factor w/ 2 levels "EAL","English": 2 1 2 1 2 1 2 1 2 1 ...
##  $ percentagePupils: num [1:1000] 39.8 60.2 85.3 14.7 47.5 ...
# Task one: LONDON - OVERALL (no school type or year separation)
# BAR CHART: mean percentage of pupils per first language

# Average the pupil percentages within each language group
overall_data <- summarise(
  group_by(dataAssignment, language),
  overall_percentage = mean(percentagePupils, na.rm = TRUE)
)

# Draw the bar chart -> used in report
ggplot(
  overall_data,
  aes(x = language, y = overall_percentage, fill = language)
) +
  # geom_col() plots the pre-computed means as bar heights
  geom_col(width = 0.6, show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    x = "Language",
    y = "Percentage of Pupils",
    title = "Overall Percentage of Pupils in London by L1"
  ) +
  theme_minimal(base_size = 14)

# BOXPLOT: spread of the raw pupil percentages for each first language

ggplot(
  dataAssignment,
  aes(x = language, y = percentagePupils, fill = language)
) +
  geom_boxplot() +
  labs(
    x = "Language",
    y = "Percentage of Pupils",
    title = "1: Distribution of Pupil Percentages by L1"
  ) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_minimal(base_size = 14) +
  # The x axis already names each language, so the fill legend is redundant
  theme(legend.position = "none")

# LINE GRAPH: mean percentage per language, split by school year

# Mean percentage for each language within each year. `.groups = "drop"`
# returns an ungrouped tibble and avoids the grouped-output message.
london_byYear <- dataAssignment %>%
  group_by(year, language) %>%
  summarise(
    mean_percentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# One line per language. `linewidth` sets the line thickness; the `size`
# aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  london_byYear,
  aes(x = factor(year), y = mean_percentage, color = language, group = language)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = "Average Percentage of Pupils by L1 2021-2025",
    x = "Year",
    y = "Average Percentage of Pupils",
    color = "Language"
  ) +
  theme_minimal(base_size = 14)

# Analysis help for boxplot: MIN, MEDIAN, MEAN & MAX per language
# (the median is the centre line a boxplot draws; the mean is kept alongside
# it for comparison)
dataAssignment %>%
  group_by(language) %>%
  summarise(
    Min = min(percentagePupils, na.rm = TRUE),
    Median = median(percentagePupils, na.rm = TRUE),
    Mean = mean(percentagePupils, na.rm = TRUE),
    Max = max(percentagePupils, na.rm = TRUE)
  )
# Analysis help for boxplot: quartiles bounding the interquartile range
# na.rm = TRUE matches the other aggregates; quantile() stops with an error
# on missing values otherwise
dataAssignment %>%
  group_by(language) %>%
  summarise(
    Q1 = quantile(percentagePupils, 0.25, na.rm = TRUE),
    Q3 = quantile(percentagePupils, 0.75, na.rm = TRUE)
  )
# Task one: LONDON

# LINE GRAPH: EAL

# Keep London EAL rows and average them per year and school type.
# `.groups = "drop"` returns an ungrouped tibble without the grouped-output
# message, so no separate ungroup() step is needed.
london_eal <- dataAssignment %>%
  filter(region == "London", language == "EAL") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# One line per school type. `linewidth` sets the line thickness; the `size`
# aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  london_eal,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  # Fixed 20-80 axis with ticks every 10 points
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  labs(
    title = "2: Overview of Pupils with EAL by School Type in London",
    x = "Year",
    y = "Average Percentage of Pupils with EAL",
    color = "School Type"
  ) +
  theme_minimal()

# LINE GRAPH: WITHOUT EAL

# Keep London rows for pupils with English as L1 and average them per year
# and school type. `.groups = "drop"` returns an ungrouped tibble without
# the grouped-output message, so no separate ungroup() step is needed.
london_English <- dataAssignment %>%
  filter(region == "London", language == "English") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# One line per school type. `linewidth` sets the line thickness; the `size`
# aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  london_English,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  # Fixed 20-80 axis with ticks every 10 points
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  labs(
    title = "3: Overview of Pupils with English as their L1 by School Type in London",
    x = "Year",
    y = "Average Percentage of Pupils  with English as their L1",
    color = "School Type"
  ) +
  theme_minimal()

# Task 2: GREENWICH -> WITH EAL

# BOXPLOT: EAL percentages pooled over 2021-2025

# Keep only the EAL rows recorded for Greenwich
data_EALGreenwich <- filter(
  dataAssignment,
  language == "EAL",
  localAuthority == "Greenwich"
)

# One box per school type on a fixed 20-80 axis with ticks every 10 points
ggplot(data_EALGreenwich, aes(x = schoolType, y = percentagePupils)) +
  geom_boxplot(fill = "lightgreen", color = "black") +
  labs(
    x = "School Type",
    y = "Percentage of Pupils",
    title = "Distribution of Pupils using EAL in Greenwich by School Type"
  ) +
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  theme_minimal()

# BAR CHART: EAL means of 2021-2025
# Average the Greenwich EAL percentages within each school type
data_EALGreenwich_summary <- data_EALGreenwich %>%
  group_by(schoolType) %>%
  summarise(mean_percentage = mean(percentagePupils, na.rm = TRUE))

# Draw the bar chart. Every layer is joined with `+`: without the `+` after
# scale_y_continuous() the plot is printed without its title, axis labels
# and theme, and labs() + theme_minimal() is evaluated as a separate,
# useless expression.
ggplot(data_EALGreenwich_summary, aes(x = schoolType, y = mean_percentage)) +
  geom_col(fill = "lightgreen", color = "black") +
  scale_y_continuous(limits = c(0, 80), breaks = seq(0, 80, 10)) +
  labs(
    title = "Average Percentage of Pupils using EAL in Greenwich by School Type",
    x = "School Type",
    y = "Average Percentage of Pupils"
  ) +
  theme_minimal()
# LINE GRAPH: EAL, each year shown separately

# Keep the Greenwich EAL rows and average them per year and school type.
# `.groups = "drop"` returns an ungrouped tibble without the grouped-output
# message, so no separate ungroup() step is needed.
data_EALGreenwich_mean <- dataAssignment %>%
  filter(region == "London", language == "EAL", localAuthority == "Greenwich") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# Line graph -> used in report. `linewidth` sets the line thickness; the
# `size` aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  data_EALGreenwich_mean,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  labs(
    title = "4: Overview of Pupils using EAL by School Type in Greenwich",
    x = "Year",
    y = "Average Percentage of Pupils using EAL",
    color = "School Type"
  ) +
  theme_minimal()

# Task 2: GREENWICH -> WITHOUT EAL

# LINE GRAPH: English, each year shown separately

# Keep the Greenwich rows for pupils with English as L1 and average them per
# year and school type. `.groups = "drop"` returns an ungrouped tibble
# without the grouped-output message, so no separate ungroup() is needed.
data_ENGGreenwich_mean <- dataAssignment %>%
  filter(region == "London", language == "English", localAuthority == "Greenwich") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# One line per school type. `linewidth` sets the line thickness; the `size`
# aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  data_ENGGreenwich_mean,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  labs(
    title = "5: Overview of Pupils with English as their L1 by School Type in Greenwich",
    x = "Year",
    y = "Average Percentage of Pupils with English as their L1",
    color = "School Type"
  ) +
  theme_minimal()

# Task 2: KENSINGTON AND CHELSEA -> EAL

# BOXPLOT: EAL percentages pooled over 2021-2025

# Keep only the EAL rows recorded for Kensington and Chelsea
KC_EAL <- dataAssignment %>%
  filter(language == "EAL", localAuthority == "Kensington and Chelsea")

# One box per school type. Scale limits discard observations that fall
# outside them before the box statistics are computed, so the axis spans the
# full 0-100 percentage range to keep every row in the plot.
ggplot(KC_EAL, aes(x = schoolType, y = percentagePupils)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  labs(
    title = "Distribution of Pupils using EAL in Kensington & Chelsea by School Type",
    x = "School Type",
    y = "Percentage of Pupils"
  ) +
  theme_minimal()

# BAR CHART: EAL means of 2021-2025
# Average the Kensington and Chelsea EAL percentages within each school type
KC_EAL_summary <- KC_EAL %>%
  group_by(schoolType) %>%
  summarise(mean_percentage = mean(percentagePupils, na.rm = TRUE))

# Draw the bar chart. Every layer is joined with `+`: without the `+` after
# scale_y_continuous() the plot is printed without its title, axis labels
# and theme, and labs() + theme_minimal() is evaluated as a separate,
# useless expression.
ggplot(KC_EAL_summary, aes(x = schoolType, y = mean_percentage)) +
  geom_col(fill = "lightblue", color = "black") +
  scale_y_continuous(limits = c(0, 80), breaks = seq(0, 80, 10)) +
  labs(
    title = "Average Percentage of Pupils using EAL in Kensington & Chelsea by School Type",
    x = "School Type",
    y = "Average Percentage of Pupils"
  ) +
  theme_minimal()
# LINE GRAPH: EAL, each year shown separately

# Keep the Kensington and Chelsea EAL rows and average them per year and
# school type. `.groups = "drop"` returns an ungrouped tibble without the
# grouped-output message, so no separate ungroup() step is needed.
KC_EAL_mean <- dataAssignment %>%
  filter(region == "London", language == "EAL", localAuthority == "Kensington and Chelsea") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# Line graph -> used in report. `linewidth` sets the line thickness; the
# `size` aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  KC_EAL_mean,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  # Axis starts at 15 rather than 20 for this borough
  scale_y_continuous(limits = c(15, 80), breaks = seq(15, 80, 10)) +
  labs(
    title = "6: Overview of Pupils using EAL by School Type in Kensington & Chelsea",
    x = "Year",
    y = "Average Percentage of Pupils using EAL",
    color = "School Type"
  ) +
  theme_minimal()

# Task 2: KENSINGTON & CHELSEA -> WITHOUT EAL (ENG)

# LINE GRAPH: English, each year shown separately

# Keep the Kensington and Chelsea rows for pupils with English as L1 and
# average them per year and school type. `.groups = "drop"` returns an
# ungrouped tibble without the grouped-output message, so no separate
# ungroup() step is needed.
KC_ENG_mean <- dataAssignment %>%
  filter(region == "London", language == "English", localAuthority == "Kensington and Chelsea") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# One line per school type. `linewidth` sets the line thickness; the `size`
# aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  KC_ENG_mean,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  # Axis extends to 85 rather than 80 for this borough
  scale_y_continuous(limits = c(20, 85), breaks = seq(20, 85, 10)) +
  labs(
    title = "7: Overview of EL1 Pupils by School Type in Kensington&Chelsea",
    x = "Year",
    y = "Average Percentage of Pupils with English as their L1",
    color = "School Type"
  ) +
  theme_minimal()

# Task 2: EALING -> WITH EAL


# BAR CHART: EAL means of 2021-2025

# Keep only the EAL rows recorded for Ealing
Ealing_EAL <- dataAssignment %>%
  filter(language == "EAL", localAuthority == "Ealing")

# Average the Ealing EAL percentages within each school type
Ealing_EAL_summary <- Ealing_EAL %>%
  group_by(schoolType) %>%
  summarise(mean_percentage = mean(percentagePupils, na.rm = TRUE))

# Draw the bar chart. Every layer is joined with `+`: without the `+` after
# scale_y_continuous() the plot is printed without its title, axis labels
# and theme, and labs() + theme_minimal() is evaluated as a separate,
# useless expression.
ggplot(Ealing_EAL_summary, aes(x = schoolType, y = mean_percentage)) +
  geom_col(fill = "orange", color = "black") +
  scale_y_continuous(limits = c(0, 80), breaks = seq(0, 80, 10)) +
  labs(
    title = "Average Percentage of Pupils using EAL in Ealing by School Type",
    x = "School Type",
    y = "Average Percentage of Pupils"
  ) +
  theme_minimal()
# LINE GRAPH: EAL, each year shown separately

# Keep the Ealing EAL rows and average them per year and school type.
# `.groups = "drop"` returns an ungrouped tibble without the grouped-output
# message, so no separate ungroup() step is needed.
Ealing_EAL_mean <- dataAssignment %>%
  filter(region == "London", language == "EAL", localAuthority == "Ealing") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# Line graph -> used in report. `linewidth` sets the line thickness; the
# `size` aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  Ealing_EAL_mean,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  labs(
    title = "8: Overview of Pupils with EAL by School Type in Ealing",
    x = "Year",
    y = "Average Percentage of Pupils using EAL",
    color = "School Type"
  ) +
  theme_minimal()

# Task 2: EALING -> WITHOUT EAL (ENG)

# LINE GRAPH: English, each year shown separately

# Keep the Ealing rows for pupils with English as L1 and average them per
# year and school type. `.groups = "drop"` returns an ungrouped tibble
# without the grouped-output message, so no separate ungroup() is needed.
Ealing_ENG_mean <- dataAssignment %>%
  filter(region == "London", language == "English", localAuthority == "Ealing") %>%
  group_by(year, schoolType) %>%
  summarise(
    meanPercentage = mean(percentagePupils, na.rm = TRUE),
    .groups = "drop"
  )

# One line per school type. `linewidth` sets the line thickness; the `size`
# aesthetic is deprecated for lines since ggplot2 3.4.0.
ggplot(
  Ealing_ENG_mean,
  aes(x = factor(year), y = meanPercentage, color = schoolType, group = schoolType)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_y_continuous(limits = c(20, 80), breaks = seq(20, 80, 10)) +
  labs(
    title = "9: Overview of Pupils with English as their L1 by School Type in Ealing",
    x = "Year",
    y = "Average Percentage of Pupils with English as their L1",
    color = "School Type"
  ) +
  theme_minimal()

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.