#1. Load the phil_pisa2018.Rda data set in R/RStudio. Study carefully every column in the data. This is the data for the Philippines of the Programme for International Student Assessment (PISA) 2018 which was conducted by the Organization for Economic Cooperation and Development (OECD). To know more about PISA, browse to PISA 2018

#2. Create a new variable REGION which will contain the labels of the 17 regions in the Philippines. The regions in the data set are coded under the variable STRATUM.

# Attach a readable region label (REGION_NEW) to every student by looking up
# the stratum code in STRATUM. Codes that are not in the lookup become NA.
# NOTE(review): later sections group by REGION rather than REGION_NEW;
# confirm which column is intended.
phil_pisa2018_region <- local({
  labels_by_stratum <- c(
    PHL0001 = "Region 1",
    PHL0002 = "Region 2",
    PHL0003 = "Region 3",
    PHL0004 = "Region 4A",
    PHL0005 = "Region 4B",
    PHL0006 = "Region 5",
    PHL0007 = "Region 6",
    PHL0008 = "Region 7",
    PHL0009 = "Region 8",
    PHL0010 = "Region 9",
    PHL0011 = "Region 10",
    PHL0012 = "Region 11",
    PHL0013 = "Region 12",
    PHL0014 = "NCR",
    PHL0015 = "CAR",
    PHL0016 = "CARAGA",
    PHL0017 = "NIR"
  )
  mutate(
    phil_pisa2018,
    REGION_NEW = unname(labels_by_stratum[as.character(STRATUM)])
  )
})

# Keep a one-column view of the new labels and preview the first rows.
region_new <- select(phil_pisa2018_region, REGION_NEW)
head(region_new)

#3. Compute mean of the variables PV1MATH, PV2MATH, PV3MATH, PV4MATH, PV5MATH, PV6MATH, PV7MATH, PV8MATH, PV9MATH, and PV10MATH. Label this new variable as MATH.

# MATH is the per-student mean of the ten mathematics plausible values
# (PV1MATH ... PV10MATH); missing values within a row are skipped.
phil_pisa2018_math <- phil_pisa2018_region %>%
  mutate(
    MATH = rowMeans(
      select(., all_of(paste0("PV", 1:10, "MATH"))),
      na.rm = TRUE
    )
  )

# One-column view of the new score, previewed for a quick sanity check.
math <- select(phil_pisa2018_math, MATH)
head(math)

#4. Compute the mean of the variables PV1READ, PV2READ, PV3READ, PV4READ, PV5READ, PV6READ, PV7READ, PV8READ, PV9READ, and PV10READ. Label this new variable as READING.

# READING is the per-student mean of the ten reading plausible values
# (PV1READ ... PV10READ); missing values within a row are skipped.
phil_pisa2018_reading <- phil_pisa2018_math %>%
  mutate(
    READING = rowMeans(
      select(., all_of(paste0("PV", 1:10, "READ"))),
      na.rm = TRUE
    )
  )

# One-column view of the new score, previewed for a quick sanity check.
reading <- select(phil_pisa2018_reading, READING)
head(reading)

#5. Compute the mean of the variables PV1SCIE, PV2SCIE, PV3SCIE, PV4SCIE, PV5SCIE, PV6SCIE, PV7SCIE, PV8SCIE, PV9SCIE, and PV10SCIE. Label this variable as SCIENCE.

# SCIENCE is the per-student mean of the ten science plausible values
# (PV1SCIE ... PV10SCIE); missing values within a row are skipped.
phil_pisa2018_science <- phil_pisa2018_reading %>%
  mutate(
    SCIENCE = rowMeans(
      select(., all_of(paste0("PV", 1:10, "SCIE"))),
      na.rm = TRUE
    )
  )

# One-column view of the new score, previewed for a quick sanity check.
science <- select(phil_pisa2018_science, SCIENCE)
head(science)

#6. Compute the mean of the variables PV1GLCM, PV2GLCM, PV3GLCM, PV4GLCM, PV5GLCM, PV6GLCM, PV7GLCM, PV8GLCM, PV9GLCM, and PV10GLCM. Call this new variable as GLCM (Global Competency).

# GLCM is the per-student mean of the ten global-competency plausible values
# (PV1GLCM ... PV10GLCM); missing values within a row are skipped.
phil_pisa2018_glcm <- phil_pisa2018_science %>%
  mutate(
    GLCM = rowMeans(
      select(., all_of(paste0("PV", 1:10, "GLCM"))),
      na.rm = TRUE
    )
  )

# One-column view of the new score, previewed for a quick sanity check.
glcm <- select(phil_pisa2018_glcm, GLCM)
head(glcm)

#7. Generate a visualization of the frequency distribution of participants disaggregated according to region and sex. Which region has the most and least number of students in the sample? Which sex has greater number of participants?

# Store the sex code as a factor on the working frame.
phil_pisa2018_glcm$ST004D01T <- as.factor(phil_pisa2018$ST004D01T)

# Count participants per region and sex, then draw horizontal dodged bars
# (regions ordered by count) with each count printed beside its bar.
phil_pisa2018_glcm %>%
  transmute(
    REGION_NEW,
    SEX = case_when(
      ST004D01T == "1" ~ "Female",
      ST004D01T == "2" ~ "Male"
    )
  ) %>%
  drop_na() %>%
  group_by(REGION_NEW, SEX) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(REGION_NEW, Count), y = Count, fill = SEX)) +
  geom_col(position = position_dodge(0.8), color = "black") +
  geom_text(
    aes(label = Count),
    position = position_dodge(0.8),
    vjust = 0.18,
    hjust = -0.3,
    size = 2.8,
    color = "black"
  ) +
  scale_y_continuous(limits = c(0, 650), expand = c(0, 0)) +
  labs(
    title = "Number of Participants According to Region and Sex",
    x = "Region",
    y = "Number of Students"
  ) +
  theme_classic() +
  coord_flip()
## `summarise()` has grouped output by 'REGION_NEW'. You can override using the
## `.groups` argument.

Answer: Region 4A has the most participants and CAR has the fewest. Based on the data, there are more “Female” participants than “Male” participants.

#8. Generate a visualization of the frequency distribution of participants disaggregated according to grade level and sex. Which grade level has the most and least number of students in the sample?

# Convert the grade and sex codes to factors on phil_pisa2018_glcm, the frame
# the pipeline below actually reads. The original converted the sex code only
# on the raw phil_pisa2018 frame; that assignment is kept so the raw frame
# ends up in the same state as before.
phil_pisa2018_glcm$ST001D01T <- as.factor(phil_pisa2018$ST001D01T)
phil_pisa2018_glcm$ST004D01T <- as.factor(phil_pisa2018$ST004D01T)
phil_pisa2018$ST004D01T <- as.factor(phil_pisa2018$ST004D01T)

# Count participants per sex and grade level and draw horizontal dodged bars
# with each count printed beside its bar. Rows whose codes are not mapped
# below get NA and are dropped.
phil_pisa2018_glcm %>%
  mutate(GRADE_LEVEL = case_when(ST001D01T == "7" ~ "Grade 7",
                                 ST001D01T == "8" ~ "Grade 8",
                                 ST001D01T == "9" ~ "Grade 9",
                                 ST001D01T == "10" ~ "Grade 10",
                                 ST001D01T == "11" ~ "Grade 11",
                                 ST001D01T == "12" ~ "Grade 12")) %>%
  mutate(SEX = case_when(ST004D01T == "1" ~ "Female",
                         ST004D01T == "2" ~ "Male")) %>%
  select(SEX, GRADE_LEVEL) %>%
  drop_na(SEX, GRADE_LEVEL) %>%
  group_by(SEX, GRADE_LEVEL) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(GRADE_LEVEL, Count), y = Count, fill = SEX)) +
  # Bars and labels share one dodge width so every label sits on its own bar.
  geom_bar(stat = "identity", position = position_dodge(0.9), color = "black") +
  geom_text(aes(label = Count), vjust = 0.18, hjust = -0.3, color = "black",
            position = position_dodge(0.9), size = 2.8) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 2100)) +
  labs(x = "Grade Level", y = "Number of Students",
       title = "Number of Participants According to Grade Level and Sex") +
  theme_classic() +
  coord_flip()
## `summarise()` has grouped output by 'SEX'. You can override using the `.groups`
## argument.

Answer: The grade levels with the most and fewest students in the sample are “Grade 9” and “Grade 12”, respectively.

#9. Generate a visualization of the frequency distribution of students who have internet access grouped according to region. In which region has the most and least number of students with internet access?

# Store the internet-access item as a factor on the working frame.
phil_pisa2018_glcm$ST011Q06TA <- as.factor(phil_pisa2018$ST011Q06TA)

# Count students per region and internet-access answer and draw horizontal
# dodged bars with each count printed beside its bar.
phil_pisa2018_glcm %>%
  mutate(INTERNET_ACCESS = case_when(ST011Q06TA == "1" ~ "YES",
                                     ST011Q06TA == "2" ~ "NO")) %>%
  select(REGION_NEW, INTERNET_ACCESS) %>%
  drop_na(REGION_NEW, INTERNET_ACCESS) %>%
  group_by(REGION_NEW, INTERNET_ACCESS) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(REGION_NEW, Count), y = Count,
             fill = INTERNET_ACCESS)) +
  # Bars and labels share one dodge width so every label sits on its own bar.
  geom_bar(stat = "identity", position = position_dodge(0.9), color = "black") +
  geom_text(aes(label = Count), vjust = 0.4, hjust = -0.3, color = "black",
            position = position_dodge(0.9), size = 2.8) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 700)) +
  labs(x = "REGION", y = "Number of Students",
       title = "Number of Students with Internet Access According to Region") +
  theme_classic() +
  coord_flip() +
  # A character fill is ordered alphabetically ("NO", "YES"), so passing
  # labels = c("YES", "NO") relabelled each group as its opposite. Setting
  # breaks instead keeps the YES-then-NO legend order with correct labels.
  scale_fill_discrete(name = "Internet Access", breaks = c("YES", "NO"))
## `summarise()` has grouped output by 'REGION_NEW'. You can override using the
## `.groups` argument.

Answer: The regions with the most and fewest students with internet access are “Region 4A” and “CAR”, respectively.

#10. Generate a visualization of the empirical probability distribution of MATH, READING, SCIENCE, and GLCM. Superimpose a normal curve on each plot. Describe the shape of the empirical distribution. Are the distributions normally distributed? Are there outliers?

# Density histogram of MATH with a normal curve superimposed.
# The curve's mean and sd ignore missing values so they describe the same
# observations that are plotted after drop_na(); without na.rm a single
# NA/NaN score would turn both parameters into NA and remove the curve.
math %>%
  drop_na(MATH) %>%
  ggplot(aes(MATH)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = "black",
                 bins = 15,
                 fill = "light blue") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.006)) +
  stat_function(fun = dnorm,
                args = list(mean = mean(math$MATH, na.rm = TRUE),
                            sd = sd(math$MATH, na.rm = TRUE)),
                col = "red") +
  theme_bw()

# Density histogram of READING with a normal curve superimposed.
# The curve's mean and sd ignore missing values so they describe the same
# observations that are plotted after drop_na(); without na.rm a single
# NA/NaN score would turn both parameters into NA and remove the curve.
reading %>%
  drop_na(READING) %>%
  ggplot(aes(READING)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = "black",
                 bins = 15,
                 fill = "light blue") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.006)) +
  stat_function(fun = dnorm,
                args = list(mean = mean(reading$READING, na.rm = TRUE),
                            sd = sd(reading$READING, na.rm = TRUE)),
                col = "red") +
  theme_bw()

# Density histogram of SCIENCE with a normal curve superimposed.
# The curve's mean and sd ignore missing values so they describe the same
# observations that are plotted after drop_na(); without na.rm a single
# NA/NaN score would turn both parameters into NA and remove the curve.
science %>%
  drop_na(SCIENCE) %>%
  ggplot(aes(SCIENCE)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = "black",
                 bins = 15,
                 fill = "light blue") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.007)) +
  scale_x_continuous(expand = c(0, 0)) +
  stat_function(fun = dnorm,
                args = list(mean = mean(science$SCIENCE, na.rm = TRUE),
                            sd = sd(science$SCIENCE, na.rm = TRUE)),
                col = "red") +
  theme_bw()

# Density histogram of GLCM with a normal curve superimposed.
# The curve's mean and sd ignore missing values so they describe the same
# observations that are plotted after drop_na(); without na.rm a single
# NA/NaN score would turn both parameters into NA and remove the curve.
glcm %>%
  drop_na(GLCM) %>%
  ggplot(aes(GLCM)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = "black",
                 bins = 17,
                 fill = "light blue") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.007)) +
  stat_function(fun = dnorm,
                args = list(mean = mean(glcm$GLCM, na.rm = TRUE),
                            sd = sd(glcm$GLCM, na.rm = TRUE)),
                col = "red") +
  theme_bw()

Answer: The MATH histogram is approximately normally distributed, while READING, SCIENCE, and GLCM show positively skewed distributions. No outliers were detected in any of them.

#11. Generate a table of means and standard error of the means of MATH, READING, SCIENCE, and GLCM by region. Which region performs best and worst in these subjects areas?

library(knitr)
# Per-region mean (mu_*) and standard error of the mean (se_* = sd / sqrt(n))
# for the four composite scores.
SAMPLE <- phil_pisa2018_glcm %>%
  group_by(REGION) %>%
  summarise(
    mu_math    = mean(MATH),
    mu_reading = mean(READING),
    mu_science = mean(SCIENCE),
    mu_glcm    = mean(GLCM),
    se_math    = sd(MATH) / sqrt(n()),
    se_reading = sd(READING) / sqrt(n()),
    se_science = sd(SCIENCE) / sqrt(n()),
    se_glcm    = sd(GLCM) / sqrt(n())
  )

# mu_subs is the grand mean of the four subject means for each region.
TABLE <- SAMPLE
TABLE$mu_subs <- rowMeans(
  SAMPLE[c("mu_math", "mu_reading", "mu_science", "mu_glcm")]
)
TABLE
kable(TABLE)
REGION mu_math mu_reading mu_science mu_glcm se_math se_reading se_science se_glcm mu_subs
Region 1 341.8855 321.0015 340.7932 353.3641 3.193656 3.309686 3.028230 3.278626 339.2611
Region 2 344.6461 326.6504 346.8693 363.0625 4.168193 4.103345 3.832510 4.170575 345.3071
Region 3 355.4606 339.0414 350.3069 367.9459 2.339399 2.436344 2.233627 2.350921 353.1887
Region 4A 365.4433 349.8223 365.8842 380.0412 1.922768 2.152546 2.006841 2.123175 365.2978
Region 4B 337.1956 326.5423 339.2683 354.9788 4.351104 4.286446 3.959615 4.300548 339.4962
Region 5 338.2182 327.4742 345.0808 363.7356 3.361817 3.384346 3.087450 3.319069 343.6272
Region 6 345.2129 333.8645 354.2873 367.2314 3.963424 4.095302 3.675700 3.838992 350.1490
Region 7 364.0297 353.4637 375.3081 386.8256 3.265602 3.666712 3.148331 3.638791 369.9068
Region 8 341.1649 343.5006 351.3861 370.2218 4.002170 4.255857 3.968101 4.164208 351.5683
Region 9 329.9775 309.2361 331.4282 345.7618 4.315175 4.039748 3.767901 4.035723 329.1009
Region 10 342.0046 330.7653 343.0657 353.5956 4.243819 4.585288 3.957055 4.768487 342.3578
Region 11 342.1345 331.7825 349.3926 362.8933 3.445807 3.935716 3.555330 3.945300 346.5507
Region 12 313.7029 297.4103 326.3249 339.4117 3.691128 3.466314 3.087471 3.304096 319.2125
NCR 384.9278 371.5097 390.4726 401.8871 2.393303 2.794039 2.449867 2.736760 387.1993
CAR 369.8061 354.3165 361.3156 383.1771 6.814103 6.866394 6.520257 6.998269 367.1538
CARAGA 305.2946 299.5162 317.1731 333.1910 4.445785 4.036499 3.637917 3.987865 313.7937
NIR 342.3126 335.8353 359.8877 369.5537 4.359274 4.657765 4.430331 4.569060 351.8973

Answer: The region that performs best in these subjects is “NCR”, with a grand mean of approximately 387.20, while the worst is “CARAGA”, with a grand mean of about 313.79.

#12. Generate a visualization of the means (with error bars) of MATH, READING, SCIENCE, and GLCM by sex. Are there glaring sex differences in these subjects areas?

# Mean and standard deviation of each composite score by sex, rounded to two
# decimals. Rows missing any of the four scores are excluded first.
TRY <- phil_pisa2018_glcm %>%
  drop_na(MATH, READING, SCIENCE, GLCM) %>%
  mutate(
    SEX = case_when(
      ST004D01T == "1" ~ "Female",
      ST004D01T == "2" ~ "Male"
    )
  ) %>%
  group_by(SEX) %>%
  summarise(
    mean_math    = round(mean(MATH), 2),
    mean_reading = round(mean(READING), 2),
    mean_science = round(mean(SCIENCE), 2),
    mean_glcm    = round(mean(GLCM), 2),
    sd_math      = round(sd(MATH), 2),
    sd_reading   = round(sd(READING), 2),
    sd_science   = round(sd(SCIENCE), 2),
    sd_glcm      = round(sd(GLCM), 2)
  )
# Bar chart of mean MATH by sex; the error bars span mean ± one standard
# deviation and each bar is labelled with its mean.
errorbar_math <- ggplot(TRY, aes(x = SEX, y = mean_math)) +
  geom_col(
    position = position_dodge(),
    fill = "light blue",
    color = "black"
  ) +
  geom_errorbar(
    aes(ymin = mean_math - sd_math, ymax = mean_math + sd_math),
    position = position_dodge(0.9),
    width = 0.1,
    color = "black"
  ) +
  scale_y_continuous(limits = c(0, 450), expand = c(0, 0)) +
  geom_text(aes(label = mean_math), vjust = -0.5) +
  labs(x = "SEX", y = "Mean of MATH") +
  theme_classic()
errorbar_math

# Bar chart of mean READING by sex; the error bars span mean ± one standard
# deviation and each bar is labelled with its mean.
errorbar_reading <- ggplot(TRY, aes(x = SEX, y = mean_reading)) +
  geom_col(
    position = position_dodge(),
    fill = "light blue",
    color = "black"
  ) +
  geom_errorbar(
    aes(ymin = mean_reading - sd_reading, ymax = mean_reading + sd_reading),
    position = position_dodge(0.9),
    width = 0.1,
    color = "black"
  ) +
  scale_y_continuous(limits = c(0, 450), expand = c(0, 0)) +
  geom_text(aes(label = mean_reading), vjust = -0.5) +
  labs(x = "SEX", y = "Mean of READING") +
  theme_classic()
errorbar_reading

# Bar chart of mean SCIENCE by sex; the error bars span mean ± one standard
# deviation and each bar is labelled with its mean.
errorbar_science <- ggplot(TRY, aes(x = SEX, y = mean_science)) +
  geom_col(
    position = position_dodge(),
    fill = "light blue",
    color = "black"
  ) +
  geom_errorbar(
    aes(ymin = mean_science - sd_science, ymax = mean_science + sd_science),
    position = position_dodge(0.9),
    width = 0.1,
    color = "black"
  ) +
  scale_y_continuous(limits = c(0, 450), expand = c(0, 0)) +
  geom_text(aes(label = mean_science), vjust = -0.5) +
  labs(x = "SEX", y = "Mean of SCIENCE") +
  theme_classic()
errorbar_science

# Bar chart of mean GLCM by sex; the error bars span mean ± one standard
# deviation and each bar is labelled with its mean.
errorbar_glcm <- ggplot(TRY, aes(x = SEX, y = mean_glcm)) +
  geom_col(
    position = position_dodge(),
    fill = "light blue",
    color = "black"
  ) +
  geom_errorbar(
    aes(ymin = mean_glcm - sd_glcm, ymax = mean_glcm + sd_glcm),
    position = position_dodge(0.9),
    width = 0.1,
    color = "black"
  ) +
  scale_y_continuous(limits = c(0, 500), expand = c(0, 0)) +
  geom_text(aes(label = mean_glcm), vjust = -0.5) +
  labs(x = "SEX", y = "Mean of GLCM") +
  theme_classic()
errorbar_glcm

Answer: Based on the results, there is a difference between the sexes in each subject, but the difference in GLCM is more pronounced than in the others. Generally, female students have a higher mean score than male students in every subject.

#13. Determine the percentage of students in each reading proficiency level for all regions and indicate the regional level of reading proficiency based on the above guide. Describe the data completely based on the supplementary file. Which region has the best and worst reading proficiency?

# Bin each READING score into a proficiency level, then compute for every
# region the number of students per level and the percentage of that region's
# students this represents. The conditions are checked top to bottom, so each
# line only needs the upper bound of its band; a missing score stays NA.
Reading_Levels <- phil_pisa2018_glcm %>%
  select(REGION, READING) %>%
  mutate(
    Reading_Level = case_when(
      READING < 262  ~ "Level 1c",
      READING < 335  ~ "Level 1b",
      READING < 407  ~ "Level 1a",
      READING < 480  ~ "Level 2",
      READING < 553  ~ "Level 3",
      READING < 626  ~ "Level 4",
      READING < 698  ~ "Level 5",
      READING >= 698 ~ "Level 6"
    )
  ) %>%
  group_by(REGION, Reading_Level) %>%
  summarise(count_reading = n()) %>%
  # The result is still grouped by REGION here, so sum() is the regional total.
  mutate(
    frequency_reading = round(count_reading / sum(count_reading), 4) * 100
  )
## `summarise()` has grouped output by 'REGION'. You can override using the
## `.groups` argument.
# Spread the regional percentages into one column per reading level and put
# the level columns in ascending proficiency order.
# Columns are ordered by name rather than by position: the positional index
# c(1,4,3,2,5,6,7,8) relied on the order in which levels first appear in the
# data and silently discarded any ninth column (e.g. "Level 6" or NA).
# any_of() skips levels that are absent; everything() keeps any extra column.
WIDE_reading <- Reading_Levels %>%
  select(REGION, Reading_Level, frequency_reading) %>%
  pivot_wider(names_from = Reading_Level, values_from = frequency_reading) %>%
  select(
    REGION,
    any_of(c("Level 1c", "Level 1b", "Level 1a", "Level 2",
             "Level 3", "Level 4", "Level 5", "Level 6")),
    everything()
  )
Reorder <- WIDE_reading
Reorder

Answer: The region with the best reading proficiency based on the guide is “NCR”, where around 33% of participants have a reading proficiency of at least Level 2. It ranks second in the percentage of participants at reading proficiency Level 4. It also has the lowest percentage of participants at reading proficiency Level 1c. The region with the worst reading proficiency is “Region 12”, where around 95% of participants belong to reading proficiency Level 1 (1a, 1b, and 1c). Moreover, only about 1% of its participants are at Level 3, which is very low given that this is the region’s highest reading proficiency level.

#14. Determine the percentage of students in each mathematics proficiency level for all regions and indicate the regional level of mathematics proficiency based on the above guide. Describe the data completely based on the supplementary file. Which region has the best and worst mathematics proficiency?

# Bin each MATH score into a proficiency level, then compute for every region
# the number of students per level and the percentage of that region's
# students this represents.
#
# Fixes relative to the earlier version:
#  * Scores are compared with numbers, not strings. `MATH < "420"` coerces the
#    score to text and compares lexicographically, which misclassifies any
#    score below 100 or above 999.
#  * The Level 2 / Level 3 boundary is a single value. The earlier code used
#    `< 483` for Level 2 but `>= 482` for Level 3, so the bands overlapped.
#    NOTE(review): 482 is used here, matching PISA's published 482.38 cut
#    score; confirm against the proficiency guide.
#  * The top band is labelled "Level 6" like the others (was "6").
#  * Levels are added with mutate() instead of a summarise() that returned
#    one row per student.
# The conditions are checked top to bottom, so each line only needs the upper
# bound of its band; a missing score stays NA.
Math_Levels <- phil_pisa2018_glcm %>%
  select(REGION, MATH) %>%
  mutate(Math_Level = case_when(MATH < 420 ~ "Level 1",
                                MATH < 482 ~ "Level 2",
                                MATH < 545 ~ "Level 3",
                                MATH < 607 ~ "Level 4",
                                MATH < 669 ~ "Level 5",
                                MATH >= 669 ~ "Level 6")) %>%
  group_by(REGION, Math_Level) %>%
  summarise(count_math = n()) %>%
  # The result is still grouped by REGION here, so sum() is the regional total.
  mutate(frequency_math = round(count_math / sum(count_math), 4) * 100)
## `summarise()` has grouped output by 'REGION'. You can override using the
## `.groups` argument.
# Show the regional percentages with one column per mathematics level.
pivot_wider(
  select(Math_Levels, REGION, Math_Level, frequency_math),
  names_from = Math_Level,
  values_from = frequency_math
)

Answer: The region with the best Mathematics proficiency is “NCR”, which has the lowest percentage of participants at Level 1 (about 68%), compared with around 80% or more in most other regions. In addition, it has the highest percentage of participants in Levels 3 and 4. In contrast, “Region 12” and “CARAGA” have the worst Mathematics proficiency: about 95% of their participants are at Level 1, and their highest proficiency level is only Level 3, reached by less than 1% of their participants.

#15. Which grade level and sex has the highest and lowest proficiency levels in reading and mathematics?

# Convert the grade and sex codes to factors on phil_pisa2018_glcm, the frame
# the pipeline below actually reads. The original converted the sex code only
# on the raw phil_pisa2018 frame; that assignment is kept so the raw frame
# ends up in the same state as before.
phil_pisa2018_glcm$ST001D01T <- as.factor(phil_pisa2018$ST001D01T)
phil_pisa2018_glcm$ST004D01T <- as.factor(phil_pisa2018$ST004D01T)
phil_pisa2018$ST004D01T <- as.factor(phil_pisa2018$ST004D01T)

# One row per student with labelled grade level and sex plus the MATH and
# READING scores; rows missing any of the four values are dropped.
# The result is grouped by GRADE_LEVEL and SEX only. Grouping by the
# continuous MATH and READING scores as well (as before) made nearly every
# row its own group and forced those columns back into later select() calls.
math_and_reading <- phil_pisa2018_glcm %>%
  mutate(GRADE_LEVEL = case_when(ST001D01T == "7" ~ "Grade 7",
                                 ST001D01T == "8" ~ "Grade 8",
                                 ST001D01T == "9" ~ "Grade 9",
                                 ST001D01T == "10" ~ "Grade 10",
                                 ST001D01T == "11" ~ "Grade 11",
                                 ST001D01T == "12" ~ "Grade 12")) %>%
  mutate(SEX = case_when(ST004D01T == "1" ~ "Female",
                         ST004D01T == "2" ~ "Male")) %>%
  select(GRADE_LEVEL, SEX, MATH, READING) %>%
  drop_na(GRADE_LEVEL, SEX, MATH, READING) %>%
  group_by(GRADE_LEVEL, SEX)
math_and_reading
# Number of students at each mathematics proficiency level, by grade level
# and sex.
#
# Fixes relative to the earlier version:
#  * ungroup() first, so the counts do not depend on how math_and_reading
#    happens to be grouped.
#  * Scores are compared with numbers, not strings. `MATH < "420"` compares
#    text lexicographically and misclassifies scores below 100 or above 999.
#  * The Level 2 / Level 3 boundary is a single value (was `< 483` then
#    `>= 482`). NOTE(review): 482 matches PISA's published 482.38 cut score;
#    confirm against the proficiency guide.
#  * The top band is labelled "Level 6" like the others (was "6").
# The conditions are checked top to bottom, so each line only needs the upper
# bound of its band.
Summary1 <- math_and_reading %>%
  ungroup() %>%
  select(GRADE_LEVEL, SEX, MATH) %>%
  mutate(Math_Level = case_when(MATH < 420 ~ "Level 1",
                                MATH < 482 ~ "Level 2",
                                MATH < 545 ~ "Level 3",
                                MATH < 607 ~ "Level 4",
                                MATH < 669 ~ "Level 5",
                                MATH >= 669 ~ "Level 6")) %>%
  group_by(Math_Level, GRADE_LEVEL, SEX) %>%
  summarise(count_math = n())
## `summarise()` has grouped output by 'Math_Level', 'GRADE_LEVEL'. You can
## override using the `.groups` argument.
Summary1
# Number of students at each reading proficiency level, by grade level and
# sex.
# ungroup() first so the counts do not depend on how math_and_reading happens
# to be grouped; previously the inherited grouping pulled MATH back into the
# selection. Levels are assigned with case_when(), checked top to bottom, so
# each line only needs the upper bound of its band.
SUMMARY2 <- math_and_reading %>%
  ungroup() %>%
  select(GRADE_LEVEL, SEX, READING) %>%
  mutate(Reading_Level = case_when(READING < 262 ~ "Level 1c",
                                   READING < 335 ~ "Level 1b",
                                   READING < 407 ~ "Level 1a",
                                   READING < 480 ~ "Level 2",
                                   READING < 553 ~ "Level 3",
                                   READING < 626 ~ "Level 4",
                                   READING < 698 ~ "Level 5",
                                   READING >= 698 ~ "Level 6")) %>%
  group_by(Reading_Level, GRADE_LEVEL, SEX) %>%
  summarise(count_reading = n())
## `summarise()` has grouped output by 'Reading_Level', 'GRADE_LEVEL'. You can
## override using the `.groups` argument.
SUMMARY2

Answer: In Mathematics, Grades 9 and 10 reach the highest proficiency level (Level 5), and in both grades only males do so. These grades also have students at the lowest proficiency level in the subject, this time including females. In Reading, only Grade 10 has students of both sexes at proficiency Level 5. Grade 10 also has students of both sexes at proficiency Level 1. In conclusion, Grade 10 male students are the group with both the highest and lowest proficiency levels in both subjects.

#16. Is there a relationship between MATH, READING, SCIENCE, and GLCM? Is this relationship the same for Grade 9 and Grade 10 students? For male and female students?

# Pairwise correlation matrix of the four composite scores across all students.
subjects <- cor(select(phil_pisa2018_glcm, MATH, READING, SCIENCE, GLCM))
subjects
##              MATH   READING   SCIENCE      GLCM
## MATH    1.0000000 0.9041794 0.8776850 0.8398788
## READING 0.9041794 1.0000000 0.9318180 0.9192905
## SCIENCE 0.8776850 0.9318180 1.0000000 0.8857789
## GLCM    0.8398788 0.9192905 0.8857789 1.0000000
# Keep the four scores and label Grade 9 and Grade 10 students; every other
# grade gets NA in GRADE_LEVEL1. The result is grouped by that label.
GRADE <- phil_pisa2018_glcm %>%
  select(MATH, READING, SCIENCE, GLCM, ST001D01T) %>%
  mutate(
    GRADE_LEVEL1 = case_when(
      ST001D01T == "9"  ~ "Grade 9",
      ST001D01T == "10" ~ "Grade 10"
    )
  ) %>%
  group_by(GRADE_LEVEL1)
GRADE
# Pairwise correlations between the four scores, computed separately for
# Grade 9 and Grade 10 (rows without a grade label are dropped).
# M = MATH, R = READING, S = SCIENCE, G = GLCM.
ONE <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1) %>%
  summarise("COR M&R" = cor(MATH, READING))
ONE
TWO <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1) %>%
  summarise("COR M&S" = cor(MATH, SCIENCE))
TWO
THREE <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1) %>%
  summarise("COR M&G" = cor(MATH, GLCM))
THREE
FOUR <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1) %>%
  summarise("COR R&S" = cor(READING, SCIENCE))
FOUR
# Column label corrected from "COR R$G" to match the other "X&Y" labels.
FIVE <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1) %>%
  summarise("COR R&G" = cor(READING, GLCM))
FIVE
SIX <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1) %>%
  summarise("COR S&G" = cor(SCIENCE, GLCM))
SIX
# Keep the four scores and label each student's sex; the result is grouped
# by SEX.
GENDER <- phil_pisa2018_glcm %>%
  select(MATH, READING, SCIENCE, GLCM, ST004D01T) %>%
  mutate(
    SEX = case_when(
      ST004D01T == "1" ~ "Female",
      ST004D01T == "2" ~ "Male"
    )
  ) %>%
  group_by(SEX)
GENDER
# Pairwise correlations between the four scores, computed separately for
# female and male students (rows without a sex label are dropped).
# M = MATH, R = READING, S = SCIENCE, G = GLCM.
# Each result is assigned to its own name only. The earlier chained
# assignments (e.g. `THIRD<-FIRST<-...`) overwrote FIRST every time, so FIRST
# ended up holding the SCIENCE-GLCM result instead of MATH-READING.
FIRST <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX) %>%
  summarise("COR M&R" = cor(MATH, READING))
FIRST
SECOND <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX) %>%
  summarise("COR M&S" = cor(MATH, SCIENCE))
SECOND
THIRD <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX) %>%
  summarise("COR M&G" = cor(MATH, GLCM))
THIRD
FOURTH <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX) %>%
  summarise("COR R&S" = cor(READING, SCIENCE))
FOURTH
FIFTH <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX) %>%
  summarise("COR R&G" = cor(READING, GLCM))
FIFTH
SIXTH <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX) %>%
  summarise("COR S&G" = cor(SCIENCE, GLCM))
SIXTH

Answer: The subjects have a high positive relationship with each other, with correlation coefficients of 0.80 and above. As shown in the tables, the relationship remains highly positive for the sexes and grade levels specified. Although the values are not exactly the same, they are close to each other and lead to the same conclusion. This means the variables have a direct relationship: they increase or decrease together.

#17. Provide concluding remarks based on your exploration of the data set. Give the important highlights.

Answer: Based on the exploration of the data, the information was taken from the Philippines, which is evident from the 17 regions mentioned. In terms of participants, the data show more female than male students in every region. The difference between the sexes in each subject is not large, except for GLCM, where the difference is around 30. In Math and Reading, the best-performing region is NCR, and the worst-performing regions are Region 12 and CARAGA. Generally, we can say that Grade 10 male students have both the highest and lowest proficiency levels in Mathematics and Reading. In conclusion, only a few regions have a high understanding of Mathematics and Reading based on the proficiency level guide.