Final Project

library("ggplot2")
library("dplyr")
library("tinytex")
library("tidyverse")
library("rmarkdown")
library("knitr")
library("magrittr")
library("gt")
# NOTE(review): hard-coded, machine-specific working directory; the script only
# runs as-is on a machine where this exact path exists.
setwd("C:/Users/vough/OneDrive/Desktop/Rstudio")
# Load the saved R data file from the working directory. The code below uses
# `phil_pisa2018` right away, so it is presumably provided by this file.
load("phil_pisa2018.Rda")

2-6

# Stratum code -> region label. Codes that are not listed here yield NA.
region_labels <- c(
  PHL0001 = "Region 1",  PHL0002 = "Region 2",  PHL0003 = "Region 3",
  PHL0004 = "Region 4A", PHL0005 = "Region 4B", PHL0006 = "Region 5",
  PHL0007 = "Region 6",  PHL0008 = "Region 7",  PHL0009 = "Region 8",
  PHL0010 = "Region 9",  PHL0011 = "Region 10", PHL0012 = "Region 11",
  PHL0013 = "Region 12", PHL0014 = "NCR",       PHL0015 = "CAR",
  PHL0016 = "CARAGA",    PHL0017 = "NIR"
)

# Row-wise mean of the ten columns PV1<suffix> .. PV10<suffix> of `data`.
pv_mean <- function(data, suffix) {
  rowMeans(select(data, all_of(paste0("PV", 1:10, suffix))))
}

# Analysis data set: the original data plus a readable region label and, for
# each domain, the per-student mean of its ten PV columns.
phil_pisa20181 <- phil_pisa2018 %>%
  mutate(
    REGION_NEW = unname(region_labels[as.character(STRATUM)]),
    # `.` is the data flowing through the pipe, so the means are always taken
    # from the same rows that are being mutated (not from a global object).
    MATH = pv_mean(., "MATH"),
    READING = pv_mean(., "READ"),
    SCIENCE = pv_mean(., "SCIE"),
    GLCM = pv_mean(., "GLCM")
  )

# Preview the new columns by name rather than by hard-coded column position.
head(phil_pisa20181, 10)[, c("REGION_NEW", "MATH", "READING", "SCIENCE", "GLCM")]

##    REGION_NEW     MATH  READING  SCIENCE     GLCM
## 1   Region 11 382.5703 365.3463 375.1763 314.9784
## 2   Region 11 430.8153 453.7995 478.0337 515.6548
## 3   Region 11 329.4409 339.1177 341.1251 388.0567
## 4   Region 11 407.1276 404.1598 389.5056 395.9010
## 5   Region 11 364.8726 337.6048 385.4762 356.3659
## 6   Region 11 451.0306 416.0747 406.5355 386.8978
## 7   Region 11 375.0757 351.0921 399.4590 345.6378
## 8   Region 11 412.3824 374.0752 423.3404 393.6071
## 9   Region 11 463.4672 453.2416 427.6998 439.9309
## 10  Region 11 434.1432 413.8357 407.9416 421.4672

# Store region and sex as factors so ggplot treats them as discrete groups.
# These conversions persist for the rest of the script.
phil_pisa20181$REGION_NEW <- as.factor(phil_pisa20181$REGION_NEW)
phil_pisa20181$ST004D01T <- as.factor(phil_pisa20181$ST004D01T)

# Number of participants per region, split by sex, regions ordered by count.
# Bars and their count labels share one dodge so the labels sit on the bars.
sex_dodge <- position_dodge(0.7)
phil_pisa20181 %>%
  count(REGION_NEW, ST004D01T, name = "count") %>%
  ggplot(aes(x = reorder(REGION_NEW, count), y = count, fill = ST004D01T)) +
  geom_col(position = sex_dodge) +
  geom_text(aes(label = count), hjust = 0.1, position = sex_dodge, size = 1.8) +
  coord_flip() +
  labs(x = "Region", y = "Number", title = "Number of participants per region") +
  # Derive the short axis labels from the factor levels themselves so they stay
  # correct whatever order reorder() produces.
  scale_x_discrete(expand = c(0, 0), labels = function(x) sub("^Region ", "", x)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 630)) +
  scale_fill_discrete(name = "Sex", labels = c("Female", "Male")) +
  # A complete theme replaces earlier theme() settings, so the title centring
  # has to be applied after theme_test().
  theme_test() +
  theme(plot.title = element_text(hjust = 0.5))

CAR has the fewest students in the sample, while Region 4A has the most. It is also evident that female participants outnumber male participants in every region in the sample.

# Number of participants at each grade level, female and male bars side by
# side, with the count printed above every bar.
phil_pisa20181 %>%
  group_by(ST001D01T, ST004D01T) %>%
  summarize(count = n()) %>%
  ggplot(aes(x = ST001D01T, y = count, fill = ST004D01T)) +
  geom_col(position = "dodge") +
  labs(x = "Grade level", y = "Number") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 2100)) +
  theme_gray() +
  scale_fill_discrete(name = "Sex", labels = c("Female", "Male")) +
  geom_text(
    aes(label = count),
    position = position_dodge(0.9),
    vjust = -0.2,
    size = 3.5
  )

Grade 12 has the fewest students in the sample, while most students are in Grade 9, where female participants outnumber male participants.

# Number of rows per region with ST011Q06TA == 1 (labelled in the plot title as
# students with internet access), regions ordered by count.
phil_pisa20181$REGION_NEW <- as.factor(phil_pisa20181$REGION_NEW)
phil_pisa20181 %>%
  filter(ST011Q06TA == 1) %>%
  count(REGION_NEW, name = "count") %>%
  ggplot(aes(x = reorder(REGION_NEW, count), y = count)) +
  geom_col(width = 0.65) +
  geom_text(aes(label = count), hjust = -0.2, size = 3.25) +
  coord_flip() +
  labs(x = "Region", y = "Number", title = "Students with internet access") +
  # Derive the short axis labels from the factor levels so they cannot drift
  # out of sync with the data-driven ordering.
  scale_x_discrete(expand = c(0, 0), labels = function(x) sub("^Region ", "", x)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 700)) +
  # theme_get() returns a complete theme, which would discard an earlier
  # theme() call; apply the title centring afterwards.
  theme_get() +
  theme(plot.title = element_text(hjust = 0.5))

Region 4A has the most students with internet access, while CARAGA has the fewest.

10.

# Density-scaled histogram of the per-student Mathematics mean, overlaid with
# the normal curve that has the same mean and standard deviation.
ggplot(phil_pisa20181, aes(x = MATH)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 140,
    fill = "light blue",
    colour = "black"
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(phil_pisa20181$MATH),
      sd = sd(phil_pisa20181$MATH)
    )
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.007)) +
  theme_get() +
  labs(x = "Mean values of Mathematics")

The mean values of Mathematics form an almost perfectly symmetrical bell shape, which suggests that they are approximately normally distributed. The graph also does not show any conspicuous outliers.

# Density-scaled histogram of the per-student Reading mean, overlaid with the
# normal curve that has the same mean and standard deviation.
ggplot(phil_pisa20181, aes(x = READING)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 140,
    fill = "gray",
    colour = "black"
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(phil_pisa20181$READING),
      sd = sd(phil_pisa20181$READING)
    )
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.007)) +
  theme_get() +
  labs(x = "Mean values of Reading")

The mean values of Reading have a right-skewed (positively skewed) distribution, which indicates that they are not normally distributed. The graph also shows outliers on the right side of the distribution.

# Density-scaled histogram of the per-student Science mean, overlaid with the
# normal curve that has the same mean and standard deviation.
ggplot(phil_pisa20181, aes(x = SCIENCE)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 140,
    fill = "light green",
    colour = "black"
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(phil_pisa20181$SCIENCE),
      sd = sd(phil_pisa20181$SCIENCE)
    )
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.008)) +
  theme_get() +
  labs(x = "Mean values of Science")

The mean values of Science have a right-skewed (positively skewed) distribution, which suggests that they are not normally distributed. The graph also shows outliers on the right side of the distribution.

# Density-scaled histogram of the per-student Global competency mean, overlaid
# with the normal curve that has the same mean and standard deviation.
ggplot(phil_pisa20181, aes(x = GLCM)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 140,
    fill = "light pink",
    colour = "black"
  ) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(phil_pisa20181$GLCM),
      sd = sd(phil_pisa20181$GLCM)
    )
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.008)) +
  theme_get() +
  labs(x = "Mean values of Global competency")

The mean values of Global competency have a right-skewed (positively skewed) distribution, which further suggests that they are not normally distributed. Furthermore, the graph shows outliers on the right side of the distribution.

11.

# Standard error of the mean: sd(x) / sqrt(n).
#
# x      Numeric vector.
# na.rm  If TRUE, missing values are dropped before both the standard
#        deviation and the sample size are computed. The default (FALSE) keeps
#        the original behaviour: any NA in `x` gives an NA result.
#
# Returns a single numeric value.
std.error <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}
# Per-region mean and standard error (both rounded to 2 decimals) of each
# domain score, rendered as a gt table with one column spanner per domain.
tab <- phil_pisa20181 %>%
  group_by(REGION_NEW) %>%
  summarize(
    mean_math = round(mean(MATH), 2),
    se_math = round(std.error(MATH), 2),
    mean_read = round(mean(READING), 2),
    se_read = round(std.error(READING), 2),
    mean_scie = round(mean(SCIENCE), 2),
    se_scie = round(std.error(SCIENCE), 2),
    mean_glcm = round(mean(GLCM), 2),
    se_glcm = round(std.error(GLCM), 2)
  ) %>%
  gt() %>%
  tab_spanner(label = "MATH", columns = c(mean_math, se_math)) %>%
  tab_spanner(label = "READING", columns = c(mean_read, se_read)) %>%
  tab_spanner(label = "SCIENCE", columns = c(mean_scie, se_scie)) %>%
  tab_spanner(label = "GLCM", columns = c(mean_glcm, se_glcm)) %>%
  cols_align(align = "center", columns = everything()) %>%
  cols_label(
    REGION_NEW = "Region",
    mean_math = "Mean", se_math = "Std. error",
    mean_read = "Mean", se_read = "Std. error",
    mean_scie = "Mean", se_scie = "Std. error",
    mean_glcm = "Mean", se_glcm = "Std. error"
  )
tab

Region	MATH		READING		SCIENCE		GLCM
Region	Mean	Std. error	Mean	Std. error	Mean	Std. error	Mean	Std. error
CAR	369.81	6.81	354.32	6.87	361.32	6.52	383.18	7.00
CARAGA	305.29	4.45	299.52	4.04	317.17	3.64	333.19	3.99
NCR	384.93	2.39	371.51	2.79	390.47	2.45	401.89	2.74
NIR	342.31	4.36	335.84	4.66	359.89	4.43	369.55	4.57
Region 1	341.89	3.19	321.00	3.31	340.79	3.03	353.36	3.28
Region 10	342.00	4.24	330.77	4.59	343.07	3.96	353.60	4.77
Region 11	342.13	3.45	331.78	3.94	349.39	3.56	362.89	3.95
Region 12	313.70	3.69	297.41	3.47	326.32	3.09	339.41	3.30
Region 2	344.65	4.17	326.65	4.10	346.87	3.83	363.06	4.17
Region 3	355.46	2.34	339.04	2.44	350.31	2.23	367.95	2.35
Region 4A	365.44	1.92	349.82	2.15	365.88	2.01	380.04	2.12
Region 4B	337.20	4.35	326.54	4.29	339.27	3.96	354.98	4.30
Region 5	338.22	3.36	327.47	3.38	345.08	3.09	363.74	3.32
Region 6	345.21	3.96	333.86	4.10	354.29	3.68	367.23	3.84
Region 7	364.03	3.27	353.46	3.67	375.31	3.15	386.83	3.64
Region 8	341.16	4.00	343.50	4.26	351.39	3.97	370.22	4.16
Region 9	329.98	4.32	309.24	4.04	331.43	3.77	345.76	4.04

Based on the table above, it is evident that NCR performs best among the 17 regions in the three subject areas: Mathematics, Reading, and Science. On the other hand, CARAGA performs worst in both Mathematics and Science, and Region 12 has the lowest mean value in Reading. The table also shows that NCR has the highest mean value in Global competency while CARAGA performs worst. In addition, the computed standard error of the mean for NCR is smaller than CARAGA’s, which implies that the computed mean for NCR in global competency is a more reliable estimate of the true mean than CARAGA’s.

12.

# Mean (rounded to 1 decimal) and standard deviation of each domain score by
# sex, using only rows where all four scores and sex are present.
phil_pisa20181 %>%
  drop_na(MATH, READING, SCIENCE, GLCM, ST004D01T) %>%
  group_by(ST004D01T) %>%
  summarize(
    mean1 = round(mean(MATH), 1),
    s1 = sd(MATH),
    mean2 = round(mean(READING), 1),
    s2 = sd(READING),
    mean3 = round(mean(SCIENCE), 1),
    s3 = sd(SCIENCE),
    mean4 = round(mean(GLCM), 1),
    s4 = sd(GLCM)
  )

## # A tibble: 2 × 9
##   ST004D01T mean1    s1 mean2    s2 mean3    s3 mean4    s4
##   <fct>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1          357.  68.7  351.  75.9  358.  69.2  382.  75.7
## 2 2          346.  74.7  325.  74.1  354.  69.9  358.  71.1

# Mean and SD (1 decimal) of each domain score by sex, computed directly from
# the data instead of being typed in by hand, so the chart cannot drift from
# the summary table. Only rows with all four scores and sex present are used.
df <- phil_pisa20181 %>%
  select(ST004D01T, Math = MATH, Reading = READING, Science = SCIENCE, Glcm = GLCM) %>%
  drop_na() %>%
  pivot_longer(cols = -ST004D01T, names_to = "Subject", values_to = "Score") %>%
  mutate(Sex = factor(ST004D01T, levels = c("1", "2"), labels = c("Female", "Male"))) %>%
  group_by(Subject, Sex) %>%
  summarize(
    Mean = round(mean(Score), 1),
    SD = round(sd(Score), 1),
    .groups = "drop"
  )

# Bars, labels and error bars share one dodge so they line up with each other.
# The error bars inherit the reordered x axis from the plot-level mapping.
bar_dodge <- position_dodge(width = 0.8)
df %>%
  ggplot(aes(x = reorder(Subject, Mean), y = Mean,
             ymin = Mean - SD, ymax = Mean + SD, fill = Sex)) +
  geom_col(width = 0.8, color = "black", position = bar_dodge) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 500)) +
  geom_text(aes(label = Mean), vjust = 8, position = bar_dodge, size = 3.5) +
  labs(x = "Subject area") +
  geom_errorbar(width = 0.1, color = "black", position = bar_dodge)

Based on the means shown in the graph, females perform better than males in all subject areas, although the difference in mean Science scores between the two sexes is small. For Reading and global competency (GLCM), there is a noticeable difference in the means between males and females.

13.

# Per-student Mathematics and Reading means rounded to whole points; these
# are the values classified into proficiency levels.
MATHR <- round(phil_pisa20181$MATH, 0)
READINGR <- round(phil_pisa20181$READING, 0)
# Assign by name (instead of cbind) so re-running this chunk overwrites the two
# columns rather than appending duplicates. The former positional column drop
# (`[-1122:-1125]`) was removed: it pointed past the last column and so removed
# nothing.
phil_pisa2018$MATHR <- MATHR
phil_pisa2018$READINGR <- READINGR

# Reading proficiency level of every student. No grouping is needed for this
# row-wise classification; later summaries group explicitly.
Readprof <- phil_pisa2018 %>%
  select(READINGR, REGION) %>%
  mutate(ReadingLevel = case_when(
    READINGR < 262 ~ "1c",
    between(READINGR, 262, 334) ~ "1b",
    between(READINGR, 335, 406) ~ "1a",
    between(READINGR, 407, 479) ~ "2",
    between(READINGR, 480, 552) ~ "3",
    between(READINGR, 553, 625) ~ "4",
    between(READINGR, 626, 697) ~ "5",
    READINGR >= 698 ~ "6"
  ))

# Mean reading score per region (rounded) and the level that mean falls into.
Readprof1 <- phil_pisa20181 %>%
  group_by(REGION) %>%
  summarize(meanread = round(mean(READING), 0)) %>%
  mutate(RegionalReadingLevel = case_when(
    meanread < 262 ~ "1c",
    between(meanread, 262, 334) ~ "1b",
    between(meanread, 335, 406) ~ "1a",
    between(meanread, 407, 479) ~ "2",
    between(meanread, 480, 552) ~ "3",
    between(meanread, 553, 625) ~ "4",
    between(meanread, 626, 697) ~ "5",
    meanread >= 698 ~ "6"
  ))

# Share of the whole sample at each reading level.
Readprof %>%
  count(ReadingLevel, name = "count") %>%
  mutate(Percentage = (count / sum(count)) * 100)

## # A tibble: 7 × 3
##   ReadingLevel count Percentage
##   <chr>        <int>      <dbl>
## 1 1a            1917    26.5   
## 2 1b            2942    40.7   
## 3 1c            1022    14.1   
## 4 2              942    13.0   
## 5 3              338     4.67  
## 6 4               70     0.968 
## 7 5                2     0.0277

# Count of students per region and reading level. After summarize() the data
# is still grouped by REGION, so Percentage is each level's share within its
# region.
Readprof <- Readprof %>%
  group_by(REGION, ReadingLevel) %>%
  summarize(count = n()) %>%
  mutate(Percentage = (count / sum(count)) * 100)
Readprof

## # A tibble: 98 × 4
## # Groups:   REGION [17]
##    REGION   ReadingLevel count Percentage
##    <fct>    <chr>        <int>      <dbl>
##  1 Region 1 1a             107     26.4  
##  2 Region 1 1b             181     44.6  
##  3 Region 1 1c              76     18.7  
##  4 Region 1 2               30      7.39 
##  5 Region 1 3               11      2.71 
##  6 Region 1 4                1      0.246
##  7 Region 2 1a              64     25.3  
##  8 Region 2 1b             113     44.7  
##  9 Region 2 1c              43     17.0  
## 10 Region 2 2               28     11.1  
## # … with 88 more rows

# Print the per-region mean reading score and the level assigned to that mean.
Readprof1

## # A tibble: 17 × 3
##    REGION    meanread RegionalReadingLevel
##    <fct>        <dbl> <chr>               
##  1 Region 1       321 1b                  
##  2 Region 2       327 1b                  
##  3 Region 3       339 1a                  
##  4 Region 4A      350 1a                  
##  5 Region 4B      327 1b                  
##  6 Region 5       327 1b                  
##  7 Region 6       334 1b                  
##  8 Region 7       353 1a                  
##  9 Region 8       344 1a                  
## 10 Region 9       309 1b                  
## 11 Region 10      331 1b                  
## 12 Region 11      332 1b                  
## 13 Region 12      297 1b                  
## 14 NCR            372 1a                  
## 15 CAR            354 1a                  
## 16 CARAGA         300 1b                  
## 17 NIR            336 1a

# Stacked bar per region showing the percentage of students at each reading
# proficiency level.
# Levels from lowest to highest; used so that the stacking order and the legend
# order agree instead of following alphabetical order.
reading_level_order <- c("1c", "1b", "1a", "2", "3", "4", "5", "6")
Readprof %>%
  ggplot(aes(x = REGION, y = Percentage,
             fill = factor(ReadingLevel, levels = reading_level_order))) +
  geom_col(width = 0.65, color = "black", position = "stack") +
  coord_flip() +
  # Short axis labels are derived from the region names, so they cannot be
  # mismatched if the factor level order ever changes.
  scale_x_discrete(expand = c(0, 0), labels = function(x) sub("^Region ", "", x)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 105)) +
  labs(x = "Region", y = "Percentage") +
  theme_classic() +
  scale_fill_discrete(name = "Level", breaks = c("1c", "1b", "1a", "2", "3", "4", "5"))

The majority of students in each of the 17 regions have reading proficiency level 1b, with Region 12 having the highest percentage at almost 50%. Level 1c is the lowest of the reading proficiency levels: readers at this level can only understand and affirm the meaning of short, simple sentences on a literal level, and read for a clear and simple purpose within a limited amount of time. CARAGA has the highest percentage (over 30%) of students at level 1c, which suggests that the region has the worst reading proficiency. Meanwhile, Region 4A has the best reading proficiency, as the only two students in the overall sample with reading proficiency level 5 come from that region. Readers at level 5 have excellent reading proficiency, with the ability to comprehend lengthy texts and to infer which information in the text is relevant even though the information of interest may be easily overlooked. On average, NCR has a 1a reading proficiency and can also be considered to have excellent reading proficiency compared to other regions, as it has the highest mean score in reading.

14.

# Mathematics proficiency level of every student, from the rounded score.
Mathprof <- select(phil_pisa2018, MATHR, REGION) %>%
  mutate(MathLevel = case_when(
    MATHR < 420 ~ "1",
    between(MATHR, 420, 481) ~ "2",
    between(MATHR, 482, 544) ~ "3",
    between(MATHR, 545, 606) ~ "4",
    between(MATHR, 607, 668) ~ "5",
    MATHR > 668 ~ "6"
  ))

# Mean mathematics score per region (rounded) and the level of that mean.
Mathprof1 <- phil_pisa20181 %>%
  group_by(REGION) %>%
  summarize(meanmath = round(mean(MATH), 0)) %>%
  mutate(RegionalMathLevel = case_when(
    meanmath < 420 ~ "1",
    between(meanmath, 420, 481) ~ "2",
    between(meanmath, 482, 544) ~ "3",
    between(meanmath, 545, 606) ~ "4",
    between(meanmath, 607, 668) ~ "5",
    meanmath > 668 ~ "6"
  ))

# Share of the whole sample at each mathematics level.
Mathprof %>%
  group_by(MathLevel) %>%
  summarize(count = n()) %>%
  mutate(Percentage = (count / sum(count)) * 100)

## # A tibble: 5 × 3
##   MathLevel count Percentage
##   <chr>     <int>      <dbl>
## 1 1          5978    82.6   
## 2 2           919    12.7   
## 3 3           295     4.08  
## 4 4            39     0.539 
## 5 5             2     0.0277

# Count of students per region and mathematics level. After summarize() the
# data is still grouped by REGION, so Percentage is each level's share within
# its region.
Mathprof <- Mathprof %>%
  group_by(REGION, MathLevel) %>%
  summarize(count = n()) %>%
  mutate(Percentage = (count / sum(count)) * 100)
Mathprof

## # A tibble: 60 × 4
## # Groups:   REGION [17]
##    REGION   MathLevel count Percentage
##    <fct>    <chr>     <int>      <dbl>
##  1 Region 1 1           364     89.7  
##  2 Region 1 2            25      6.16 
##  3 Region 1 3            17      4.19 
##  4 Region 2 1           213     84.2  
##  5 Region 2 2            36     14.2  
##  6 Region 2 3             4      1.58 
##  7 Region 3 1           722     82.2  
##  8 Region 3 2           111     12.6  
##  9 Region 3 3            39      4.44 
## 10 Region 3 4             6      0.683
## # … with 50 more rows

# Print the per-region mean mathematics score and the level assigned to it.
Mathprof1

## # A tibble: 17 × 3
##    REGION    meanmath RegionalMathLevel
##    <fct>        <dbl> <chr>            
##  1 Region 1       342 1                
##  2 Region 2       345 1                
##  3 Region 3       355 1                
##  4 Region 4A      365 1                
##  5 Region 4B      337 1                
##  6 Region 5       338 1                
##  7 Region 6       345 1                
##  8 Region 7       364 1                
##  9 Region 8       341 1                
## 10 Region 9       330 1                
## 11 Region 10      342 1                
## 12 Region 11      342 1                
## 13 Region 12      314 1                
## 14 NCR            385 1                
## 15 CAR            370 1                
## 16 CARAGA         305 1                
## 17 NIR            342 1

# Stacked bar per region showing the percentage of students at each
# mathematics proficiency level.
Mathprof %>%
  ggplot(aes(x = REGION, y = Percentage, fill = MathLevel)) +
  geom_col(width = 0.65, color = "black", position = "stack") +
  coord_flip() +
  # Short axis labels are derived from the region names, so they cannot be
  # mismatched if the factor level order ever changes.
  scale_x_discrete(expand = c(0, 0), labels = function(x) sub("^Region ", "", x)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 105)) +
  labs(x = "Region") +
  theme_classic() +
  scale_fill_discrete(name = "Level")

Mathematics proficiency level 1 is the lowest proficiency a student in the sample can get. Based on the computed percentages, the majority of students in each of the 17 regions have mathematics proficiency level 1, with CARAGA having the highest share of all regions at over 95%. This implies that CARAGA has the worst mathematics proficiency: the majority of its sampled students can only answer questions involving familiar contexts where relevant information is present and the questions are clearly defined. Only two students in the sample reached mathematics proficiency level 5, one from Region 4A and the other from Region 6; this is the highest proficiency level attained by any student in the sample. Students at level 5 have excellent mathematics proficiency, with the ability to develop and work with models for complex situations, identifying constraints and specifying assumptions. On average, NCR can be considered to have excellent mathematics proficiency compared to other regions, as it has the highest percentage of students at Level 3 and the lowest percentage of students at Level 1 among all regions.

# Add a text Sex column derived from ST004D01T (1 -> "Female", 2 -> "Male");
# any other value becomes NA.
phil_pisa2018 <- mutate(
  phil_pisa2018,
  Sex = case_when(
    ST004D01T == 2 ~ "Male",
    ST004D01T == 1 ~ "Female"
  )
)

15.

# Reading level of every student, then the count of students per grade, sex
# and level. After summarize() the data is still grouped by grade and sex, so
# Percentage is each level's share within its grade/sex group.
Readprof2 <- phil_pisa2018 %>%
  select(READINGR, ST001D01T, Sex) %>%
  mutate(ReadingLevel = case_when(
    READINGR < 262 ~ "1c",
    between(READINGR, 262, 334) ~ "1b",
    between(READINGR, 335, 406) ~ "1a",
    between(READINGR, 407, 479) ~ "2",
    between(READINGR, 480, 552) ~ "3",
    between(READINGR, 553, 625) ~ "4",
    between(READINGR, 626, 697) ~ "5",
    READINGR >= 698 ~ "6"
  )) %>%
  group_by(ST001D01T, Sex, ReadingLevel) %>%
  summarize(count = n()) %>%
  mutate(Percentage = (count / sum(count)) * 100)

# Mean reading score (rounded) per grade and sex, and the level of that mean.
Readprof3 <- phil_pisa20181 %>%
  group_by(ST001D01T, ST004D01T) %>%
  summarize(meanread1 = round(mean(READING), 0)) %>%
  mutate(MeanReadingLevel = case_when(
    meanread1 < 262 ~ "1c",
    between(meanread1, 262, 334) ~ "1b",
    between(meanread1, 335, 406) ~ "1a",
    between(meanread1, 407, 479) ~ "2",
    between(meanread1, 480, 552) ~ "3",
    between(meanread1, 553, 625) ~ "4",
    between(meanread1, 626, 697) ~ "5",
    meanread1 >= 698 ~ "6"
  ))

Readprof2

## # A tibble: 57 × 5
## # Groups:   ST001D01T, Sex [12]
##    ST001D01T   Sex    ReadingLevel count Percentage
##    <dbl+lbl>   <chr>  <chr>        <int>      <dbl>
##  1 7 [Grade 7] Female 1a               6      4.69 
##  2 7 [Grade 7] Female 1b              60     46.9  
##  3 7 [Grade 7] Female 1c              61     47.7  
##  4 7 [Grade 7] Female 2                1      0.781
##  5 7 [Grade 7] Male   1a               7      3.72 
##  6 7 [Grade 7] Male   1b              81     43.1  
##  7 7 [Grade 7] Male   1c              99     52.7  
##  8 7 [Grade 7] Male   2                1      0.532
##  9 8 [Grade 8] Female 1a              61     15.4  
## 10 8 [Grade 8] Female 1b             202     50.9  
## # … with 47 more rows

# Print the mean reading score per grade and sex with its proficiency level.
Readprof3

## # A tibble: 12 × 4
## # Groups:   ST001D01T [6]
##    ST001D01T     ST004D01T meanread1 MeanReadingLevel
##    <dbl+lbl>     <fct>         <dbl> <chr>           
##  1  7 [Grade 7]  1               268 1b              
##  2  7 [Grade 7]  2               264 1b              
##  3  8 [Grade 8]  1               291 1b              
##  4  8 [Grade 8]  2               280 1b              
##  5  9 [Grade 9]  1               349 1a              
##  6  9 [Grade 9]  2               328 1b              
##  7 10 [Grade 10] 1               379 1a              
##  8 10 [Grade 10] 2               354 1a              
##  9 11 [Grade 11] 1               447 2               
## 10 11 [Grade 11] 2               436 2               
## 11 12 [Grade 12] 1               507 3               
## 12 12 [Grade 12] 2               535 3

# Stacked bar per grade level, one panel per sex, showing the percentage of
# students at each reading proficiency level.
# Levels from lowest to highest; used so that the stacking order and the legend
# order agree instead of following alphabetical order.
reading_level_order <- c("1c", "1b", "1a", "2", "3", "4", "5", "6")
Readprof2 %>%
  ggplot(aes(x = ST001D01T, y = Percentage)) +
  geom_col(
    aes(fill = factor(ReadingLevel, levels = reading_level_order)),
    width = 0.7,
    position = "stack",
    color = "black"
  ) +
  facet_wrap(~Sex) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 105)) +
  theme_bw() +
  labs(x = "Grade Level") +
  scale_fill_discrete(name = "Level", breaks = c("1c", "1b", "1a", "2", "3", "4", "5"))

For both males and females, Grade 7 has the lowest proficiency level in reading, with a higher percentage of Grade 7 males than Grade 7 females at level 1c. The only two students who reached level 5 reading proficiency are a male and a female student from Grade 10. On average, Grade 12 has the highest reading proficiency level for both sexes, at level 3, though the size of the sample in this grade level, which is three students, perhaps plays a role in the result.

# Mathematics level of every student, then the count of students per grade,
# sex and level. After summarize() the data is still grouped by grade and sex,
# so Percentage is each level's share within its grade/sex group.
# Note: this overwrites the regional Mathprof1 created earlier in the script.
Mathprof1 <- phil_pisa2018 %>%
  select(MATHR, ST001D01T, Sex) %>%
  mutate(MathLevel = case_when(
    MATHR < 420 ~ "1",
    between(MATHR, 420, 481) ~ "2",
    between(MATHR, 482, 544) ~ "3",
    between(MATHR, 545, 606) ~ "4",
    between(MATHR, 607, 668) ~ "5",
    MATHR > 668 ~ "6"
  )) %>%
  group_by(ST001D01T, Sex, MathLevel) %>%
  summarize(count = n()) %>%
  mutate(Percentage = (count / sum(count)) * 100)

# Mean mathematics score (rounded) per grade and sex, and the level of that
# mean.
Mathprof3 <- phil_pisa20181 %>%
  group_by(ST001D01T, ST004D01T) %>%
  summarize(meanmath1 = round(mean(MATH), 0)) %>%
  mutate(MeanMathLevel = case_when(
    meanmath1 < 420 ~ "1",
    between(meanmath1, 420, 481) ~ "2",
    between(meanmath1, 482, 544) ~ "3",
    between(meanmath1, 545, 606) ~ "4",
    between(meanmath1, 607, 668) ~ "5",
    meanmath1 > 668 ~ "6"
  ))

Mathprof1

## # A tibble: 38 × 5
## # Groups:   ST001D01T, Sex [12]
##    ST001D01T   Sex    MathLevel count Percentage
##    <dbl+lbl>   <chr>  <chr>     <int>      <dbl>
##  1 7 [Grade 7] Female 1           127     99.2  
##  2 7 [Grade 7] Female 2             1      0.781
##  3 7 [Grade 7] Male   1           186     98.9  
##  4 7 [Grade 7] Male   2             2      1.06 
##  5 8 [Grade 8] Female 1           384     96.7  
##  6 8 [Grade 8] Female 2            13      3.27 
##  7 8 [Grade 8] Male   1           500     96.9  
##  8 8 [Grade 8] Male   2            14      2.71 
##  9 8 [Grade 8] Male   3             2      0.388
## 10 9 [Grade 9] Female 1          1635     83.5  
## # … with 28 more rows

# Print the mean mathematics score per grade and sex with its proficiency level.
Mathprof3

## # A tibble: 12 × 4
## # Groups:   ST001D01T [6]
##    ST001D01T     ST004D01T meanmath1 MeanMathLevel
##    <dbl+lbl>     <fct>         <dbl> <chr>        
##  1  7 [Grade 7]  1               272 1            
##  2  7 [Grade 7]  2               270 1            
##  3  8 [Grade 8]  1               302 1            
##  4  8 [Grade 8]  2               297 1            
##  5  9 [Grade 9]  1               355 1            
##  6  9 [Grade 9]  2               351 1            
##  7 10 [Grade 10] 1               383 1            
##  8 10 [Grade 10] 2               377 1            
##  9 11 [Grade 11] 1               449 2            
## 10 11 [Grade 11] 2               464 2            
## 11 12 [Grade 12] 1               484 3            
## 12 12 [Grade 12] 2               501 3

# Stacked bar per grade level, one panel per sex, showing the percentage of
# students at each mathematics proficiency level.
Mathprof1 %>%
  ggplot(aes(x = ST001D01T, y = Percentage)) +
  geom_col(
    aes(fill = MathLevel),
    width = 0.7,
    position = "stack",
    color = "black"
  ) +
  facet_wrap(~Sex) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 105)) +
  theme_bw() +
  labs(x = "Grade Level") +
  scale_fill_discrete(name = "Level")

For both males and females, Grade 7 has the lowest proficiency level in Mathematics, with females faring slightly worse than males, as almost 100% of Grade 7 female students are at level 1 in mathematics proficiency. Only two male students in the sample reached mathematics proficiency level 5, one from Grade 9 and the other from Grade 10. On average, Grade 12 has the highest mathematics proficiency level for both sexes, at level 3, though the size of the sample in this grade level, which is three students, perhaps plays a role in the result.

16.

# MATH vs READING for Grade 9 and 10 students; colour encodes grade and point
# shape encodes sex.
phil_pisa20181 %>%
  filter(ST001D01T %in% c("9","10")) %>%
  ggplot() +
  geom_point(
    aes(
      x = MATH,
      y = READING,
      color = as.factor(ST001D01T),
      shape = as.factor(ST004D01T)
    ),
    size = 2.5
  ) +
  scale_shape_discrete(name = "Sex", labels = c("Female", "Male")) +
  scale_colour_discrete(name = "Grade")

# SCIENCE vs GLCM for Grade 9 and 10 students; colour encodes grade and point
# shape encodes sex.
phil_pisa20181 %>%
  filter(ST001D01T %in% c("9","10")) %>%
  ggplot() +
  geom_point(
    aes(
      x = SCIENCE,
      y = GLCM,
      color = as.factor(ST001D01T),
      shape = as.factor(ST004D01T)
    ),
    size = 2.5
  ) +
  scale_shape_discrete(name = "Sex", labels = c("Female", "Male")) +
  scale_colour_discrete(name = "Grade")

# MATH vs SCIENCE for Grade 9 and 10 students; colour encodes grade and point
# shape encodes sex.
phil_pisa20181 %>%
  filter(ST001D01T %in% c("9","10")) %>%
  ggplot() +
  geom_point(
    aes(
      x = MATH,
      y = SCIENCE,
      color = as.factor(ST001D01T),
      shape = as.factor(ST004D01T)
    ),
    size = 2.5
  ) +
  scale_shape_discrete(name = "Sex", labels = c("Female", "Male")) +
  scale_colour_discrete(name = "Grade")

# SCIENCE vs READING for Grade 9 and 10 students; colour encodes grade and
# point shape encodes sex.
phil_pisa20181 %>%
  filter(ST001D01T %in% c("9","10")) %>%
  ggplot() +
  geom_point(
    aes(
      x = SCIENCE,
      y = READING,
      color = as.factor(ST001D01T),
      shape = as.factor(ST004D01T)
    ),
    size = 2.5
  ) +
  scale_shape_discrete(name = "Sex", labels = c("Female", "Male")) +
  scale_colour_discrete(name = "Grade")

# MATH vs GLCM for Grade 9 and 10 students; colour encodes grade and point
# shape encodes sex.
phil_pisa20181 %>%
  filter(ST001D01T %in% c("9","10")) %>%
  ggplot() +
  geom_point(
    aes(
      x = MATH,
      y = GLCM,
      color = as.factor(ST001D01T),
      shape = as.factor(ST004D01T)
    ),
    size = 2.5
  ) +
  scale_shape_discrete(name = "Sex", labels = c("Female", "Male")) +
  scale_colour_discrete(name = "Grade")

# READING vs GLCM for Grade 9 and 10 students; colour encodes grade and point
# shape encodes sex.
phil_pisa20181 %>%
  filter(ST001D01T %in% c("9","10")) %>%
  ggplot() +
  geom_point(
    aes(
      x = READING,
      y = GLCM,
      color = as.factor(ST001D01T),
      shape = as.factor(ST004D01T)
    ),
    size = 2.5
  ) +
  scale_shape_discrete(name = "Sex", labels = c("Female", "Male")) +
  scale_colour_discrete(name = "Grade")

Based on the six plots, it is evident that there is a relationship between MATH, READING, SCIENCE, and GLCM. The relationship between these subjects is the same for Grade 9 and Grade 10 students, and for males and females: in every case the plots show a positive correlation. That is, if a male or female Grade 9 or 10 student’s score in one subject is high, then his/her score in the other subject is most probably high as well.

Conclusion

After exploring the data set, it can be concluded that, on average, NCR performs best in the three subject areas (Mathematics, Reading, and Science), while CARAGA and Region 12 have the worst mean scores in these subjects. In addition, NCR has the best Mathematics and Reading proficiency among the 17 regions, while CARAGA and Region 12 have the worst proficiency levels. Consequently, NCR has the highest mean in global competency and CARAGA has the lowest mean. One of the factors that may affect students' performance in the given subjects is having an internet connection. Based on the given data set, approximately 63% of the sampled students in NCR have an internet connection, while roughly 32% of the participants from CARAGA and Region 12 have internet connectivity. This may imply that having an internet connection can positively impact a student's academic performance. For the overall performance of the 17 regions, about 40% of the sample has reading proficiency level 1b, which is the highest percentage among the reading proficiency levels. This indicates poor reading proficiency for the majority of students in the sample, as readers at this level can only evaluate the literal meaning of simple sentences. Moreover, over 82% of the sample has Mathematics proficiency level 1, which suggests that most participants have very poor mathematics proficiency as well. Students at this level can only answer questions involving familiar contexts where relevant information is present and the questions are clearly defined. Thus, based on this data set, the majority of Grade 7 to 12 students have poor Reading and Mathematics proficiency, with Grade 7 students being the dominant group within this majority. It is also noteworthy that as the grade level increases, the proficiency level seems to improve, especially in Mathematics.
In addition, the participants' average scores in each subject do not differ greatly between the sexes, though females perform slightly better than male high school students. Nonetheless, the overall exploration of the given data set strongly suggests that almost all high school students in the sample have poor to average knowledge of the given subject areas.

Final Project

Trixia Gonzales

2023-02-02