#1. Load the phil_pisa2018.Rda data set in R/RStudio. Study carefully every column in the data. This is the data for the Philippines of the Programme for International Student Assessment (PISA) 2018 which was conducted by the Organization for Economic Cooperation and Development (OECD). To know more about PISA, browse to PISA 2018
#2. Create a new variable REGION which will contain the labels of the 17 regions in the Philippines. The regions in the data set are coded under the variable STRATUM.
# Task 2: translate each STRATUM code into a readable region label.
# The label is stored in REGION_NEW; codes that are not listed become NA.
# NOTE(review): later sections group by a column named REGION, while this
# step only creates REGION_NEW -- confirm REGION already exists in the data.
region_labels <- c(
  PHL0001 = "Region 1",
  PHL0002 = "Region 2",
  PHL0003 = "Region 3",
  PHL0004 = "Region 4A",
  PHL0005 = "Region 4B",
  PHL0006 = "Region 5",
  PHL0007 = "Region 6",
  PHL0008 = "Region 7",
  PHL0009 = "Region 8",
  PHL0010 = "Region 9",
  PHL0011 = "Region 10",
  PHL0012 = "Region 11",
  PHL0013 = "Region 12",
  PHL0014 = "NCR",
  PHL0015 = "CAR",
  PHL0016 = "CARAGA",
  PHL0017 = "NIR"
)
phil_pisa2018_region <- phil_pisa2018 %>%
  mutate(REGION_NEW = unname(region_labels[as.character(STRATUM)]))

# Keep only the new label column for a quick visual check.
region_new <- select(phil_pisa2018_region, REGION_NEW)
head(region_new)
#3. Compute mean of the variables PV1MATH, PV2MATH, PV3MATH, PV4MATH, PV5MATH, PV6MATH, PV7MATH, PV8MATH, PV9MATH, and PV10MATH. Label this new variable as MATH.
# Task 3: MATH is the row-wise mean of PV1MATH ... PV10MATH.
# na.rm = TRUE skips missing entries within a row.
math_pv_cols <- paste0("PV", 1:10, "MATH")
phil_pisa2018_math <- phil_pisa2018_region %>%
  mutate(
    MATH = rowMeans(
      select(phil_pisa2018_region, all_of(math_pv_cols)),
      na.rm = TRUE
    )
  )

# One-column view of the new score, used for a quick check and later plots.
math <- select(phil_pisa2018_math, MATH)
head(math)
#4. Compute the mean of the variables PV1READ, PV2READ, PV3READ, PV4READ, PV5READ, PV6READ, PV7READ, PV8READ, PV9READ, and PV10READ. Label this new variable as READING.
# Task 4: READING is the row-wise mean of PV1READ ... PV10READ.
# na.rm = TRUE skips missing entries within a row.
reading_pv_cols <- paste0("PV", 1:10, "READ")
phil_pisa2018_reading <- phil_pisa2018_math %>%
  mutate(
    READING = rowMeans(
      select(phil_pisa2018_math, all_of(reading_pv_cols)),
      na.rm = TRUE
    )
  )

# One-column view of the new score, used for a quick check and later plots.
reading <- select(phil_pisa2018_reading, READING)
head(reading)
#5. Compute the mean of the variables PV1SCIE, PV2SCIE, PV3SCIE, PV4SCIE, PV5SCIE, PV6SCIE, PV7SCIE, PV8SCIE, PV9SCIE, and PV10SCIE. Label this variable as SCIENCE.
# Task 5: SCIENCE is the row-wise mean of PV1SCIE ... PV10SCIE.
# na.rm = TRUE skips missing entries within a row.
science_pv_cols <- paste0("PV", 1:10, "SCIE")
phil_pisa2018_science <- phil_pisa2018_reading %>%
  mutate(
    SCIENCE = rowMeans(
      select(phil_pisa2018_reading, all_of(science_pv_cols)),
      na.rm = TRUE
    )
  )

# One-column view of the new score, used for a quick check and later plots.
science <- select(phil_pisa2018_science, SCIENCE)
head(science)
#6. Compute the mean of the variables PV1GLCM, PV2GLCM, PV3GLCM, PV4GLCM, PV5GLCM, PV6GLCM, PV7GLCM, PV8GLCM, PV9GLCM, and PV10GLCM. Call this new variable as GLCM (Global Competency).
# Task 6: GLCM is the row-wise mean of PV1GLCM ... PV10GLCM.
# na.rm = TRUE skips missing entries within a row.
glcm_pv_cols <- paste0("PV", 1:10, "GLCM")
phil_pisa2018_glcm <- phil_pisa2018_science %>%
  mutate(
    GLCM = rowMeans(
      select(phil_pisa2018_science, all_of(glcm_pv_cols)),
      na.rm = TRUE
    )
  )

# One-column view of the new score, used for a quick check and later plots.
glcm <- select(phil_pisa2018_glcm, GLCM)
head(glcm)
#7. Generate a visualization of the frequency distribution of participants disaggregated according to region and sex. Which region has the most and least number of students in the sample? Which sex has greater number of participants?
# Task 7: horizontal dodged bar chart of participant counts by region and sex.
# The sex code is stored as a factor on the working data frame so the
# comparisons against "1" / "2" below are made on its labels.
phil_pisa2018_glcm$ST004D01T <- as.factor(phil_pisa2018_glcm$ST004D01T)

phil_pisa2018_glcm %>%
  mutate(SEX = case_when(
    ST004D01T == "1" ~ "Female",
    ST004D01T == "2" ~ "Male"
  )) %>%
  select(REGION_NEW, SEX) %>%
  # Rows without a region label or a recognised sex code are not counted.
  drop_na(REGION_NEW, SEX) %>%
  group_by(REGION_NEW, SEX) %>%
  # Drop the grouping explicitly; the plot only needs the flat count table.
  summarise(Count = n(), .groups = "drop") %>%
  ggplot(aes(x = reorder(REGION_NEW, Count), y = Count, fill = SEX)) +
  geom_bar(
    stat = "identity", position = position_dodge(0.8), color = "black"
  ) +
  # Same dodge width as the bars so each label sits on its own bar.
  geom_text(
    aes(label = Count),
    vjust = 0.18, hjust = -0.3, color = "black",
    position = position_dodge(0.8), size = 2.8
  ) +
  # `limits` is spelled out in full (the original relied on partial matching
  # of `limit`). Counts above 650 would be dropped from the plot.
  scale_y_continuous(expand = c(0, 0), limits = c(0, 650)) +
  labs(
    x = "Region", y = "Number of Students",
    title = "Number of Participants According to Region and Sex"
  ) +
  theme_classic() +
  coord_flip()
## `summarise()` has grouped output by 'REGION_NEW'. You can override using the
## `.groups` argument.
Answer: The region with the largest number of participants is Region 4A and
the one with the fewest is CAR. Based on the data, there are more “Female”
participants than “Male” participants.
#8. Generate a visualization of the frequency distribution of participants disaggregated according to grade level and sex. Which grade level has the most and least number of students in the sample?
# Task 8: horizontal dodged bar chart of participant counts by grade level
# and sex. Both code columns are converted to factors on the working data
# frame (the original converted ST004D01T on the raw `phil_pisa2018` frame,
# which this pipeline never reads).
phil_pisa2018_glcm$ST001D01T <- as.factor(phil_pisa2018_glcm$ST001D01T)
phil_pisa2018_glcm$ST004D01T <- as.factor(phil_pisa2018_glcm$ST004D01T)

phil_pisa2018_glcm %>%
  mutate(GRADE_LEVEL = case_when(
    ST001D01T == "7" ~ "Grade 7",
    ST001D01T == "8" ~ "Grade 8",
    ST001D01T == "9" ~ "Grade 9",
    ST001D01T == "10" ~ "Grade 10",
    ST001D01T == "11" ~ "Grade 11",
    ST001D01T == "12" ~ "Grade 12"
  )) %>%
  mutate(SEX = case_when(
    ST004D01T == "1" ~ "Female",
    ST004D01T == "2" ~ "Male"
  )) %>%
  select(SEX, GRADE_LEVEL) %>%
  # Rows with an unrecognised grade or sex code are not counted.
  drop_na(SEX, GRADE_LEVEL) %>%
  group_by(SEX, GRADE_LEVEL) %>%
  summarise(Count = n(), .groups = "drop") %>%
  ggplot(aes(x = reorder(GRADE_LEVEL, Count), y = Count, fill = SEX)) +
  # Bars and labels share one dodge width; the original mixed the default
  # width with 0.8, which shifts the labels away from their bars.
  geom_bar(
    stat = "identity", position = position_dodge(0.9), color = "black"
  ) +
  geom_text(
    aes(label = Count),
    vjust = 0.18, hjust = -0.3, color = "black",
    position = position_dodge(0.9), size = 2.8
  ) +
  # `limits` spelled out in full; counts above 2100 would be dropped.
  scale_y_continuous(expand = c(0, 0), limits = c(0, 2100)) +
  labs(
    x = "Grade Level", y = "Number of Students",
    title = "Number of Participants According to Grade Level and Sex"
  ) +
  theme_classic() +
  coord_flip()
## `summarise()` has grouped output by 'SEX'. You can override using the `.groups`
## argument.
Answer: The grade levels with the most and the fewest students in
the sample are “Grade 9” and “Grade 12”, respectively.
#9. Generate a visualization of the frequency distribution of students who have internet access grouped according to region. Which region has the most and least number of students with internet access?
# Task 9: horizontal dodged bar chart of student counts by region and
# internet access ("YES" for code 1, "NO" for code 2).
phil_pisa2018_glcm$ST011Q06TA <- as.factor(phil_pisa2018_glcm$ST011Q06TA)

phil_pisa2018_glcm %>%
  mutate(INTERNET_ACCESS = case_when(
    ST011Q06TA == "1" ~ "YES",
    ST011Q06TA == "2" ~ "NO"
  )) %>%
  select(REGION_NEW, INTERNET_ACCESS) %>%
  # Rows without a region label or a recognised answer are not counted.
  drop_na(REGION_NEW, INTERNET_ACCESS) %>%
  group_by(REGION_NEW, INTERNET_ACCESS) %>%
  summarise(Count = n(), .groups = "drop") %>%
  ggplot(
    aes(x = reorder(REGION_NEW, Count), y = Count, fill = INTERNET_ACCESS)
  ) +
  # Bars and labels share one dodge width so labels line up with their bars.
  geom_bar(
    stat = "identity", position = position_dodge(0.9), color = "black"
  ) +
  geom_text(
    aes(label = Count),
    vjust = 0.4, hjust = -0.3, color = "black",
    position = position_dodge(0.9), size = 2.8
  ) +
  # `limits` spelled out in full; counts above 700 would be dropped.
  scale_y_continuous(expand = c(0, 0), limits = c(0, 700)) +
  labs(
    x = "REGION", y = "Number of Students",
    title = "Number of Students with Internet Access According to Region"
  ) +
  theme_classic() +
  coord_flip() +
  # Only the legend title is set. The fill values are already "YES"/"NO";
  # the original passed labels = c("YES", "NO"), which is applied to the
  # alphabetically ordered values ("NO", "YES") and so swapped the legend.
  scale_fill_discrete(name = "Internet Access")
## `summarise()` has grouped output by 'REGION_NEW'. You can override using the
## `.groups` argument.
Answer: The regions with the most and the fewest students with
internet access are “Region 4A” and “CAR”, respectively.
#10. Generate a visualization of the empirical probability distribution of MATH, READING, SCIENCE, and GLCM. Superimpose a normal curve on each plot. Describe the shape of the empirical distribution. Are the distributions normally distributed? Are there outliers?
# Task 10: density histograms of MATH, READING, SCIENCE and GLCM, each with
# a normal curve drawn on top.

# Draw a density histogram of one score column with a fitted normal curve.
#
# scores    data frame holding the score column
# score_col name of the score column, as a string
# bins      number of histogram bins
# y_max     upper limit of the density axis
# tight_x   if TRUE, remove the padding on the x axis
#
# Returns the ggplot object. The curve's mean and sd are computed from the
# same NA-free rows that are plotted; the original took them from the
# unfiltered vector, so a single NA would have made the curve disappear.
plot_score_density <- function(scores, score_col, bins, y_max,
                               tight_x = FALSE) {
  clean_scores <- drop_na(scores, all_of(score_col))
  score_values <- clean_scores[[score_col]]

  density_plot <- ggplot(clean_scores, aes(x = .data[[score_col]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      colour = "black",
      bins = bins,
      fill = "light blue"
    ) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, y_max)) +
    stat_function(
      fun = dnorm,
      args = list(mean = mean(score_values), sd = sd(score_values)),
      col = "red"
    ) +
    # Keep the plain column name as the axis title.
    labs(x = score_col) +
    theme_bw()

  if (tight_x) {
    density_plot <- density_plot + scale_x_continuous(expand = c(0, 0))
  }
  density_plot
}

plot_score_density(math, "MATH", bins = 15, y_max = 0.006)
plot_score_density(reading, "READING", bins = 15, y_max = 0.006)
plot_score_density(science, "SCIENCE", bins = 15, y_max = 0.007,
                   tight_x = TRUE)
plot_score_density(glcm, "GLCM", bins = 17, y_max = 0.007)
Answer: The MATH data show an approximately normal distribution, while
READING, SCIENCE, and GLCM show positively skewed distributions. No
outliers were detected in any of them.
#11. Generate a table of means and standard error of the means of MATH, READING, SCIENCE, and GLCM by region. Which region performs best and worst in these subjects areas?
library(knitr)
# Task 11: per-region mean and standard error (sd / sqrt(n)) of the four
# scores, plus mu_subs, the average of the four regional means.
region_scores <- select(
  phil_pisa2018_glcm, REGION, MATH, READING, SCIENCE, GLCM
)

SAMPLE <- region_scores %>%
  group_by(REGION) %>%
  summarise(
    mu_math = mean(MATH),
    mu_reading = mean(READING),
    mu_science = mean(SCIENCE),
    mu_glcm = mean(GLCM),
    se_math = sd(MATH) / sqrt(n()),
    se_reading = sd(READING) / sqrt(n()),
    se_science = sd(SCIENCE) / sqrt(n()),
    se_glcm = sd(GLCM) / sqrt(n())
  )

# Grand mean across the four subject means, one value per region.
subject_mean_cols <- c("mu_math", "mu_reading", "mu_science", "mu_glcm")
TABLE <- mutate(
  SAMPLE,
  mu_subs = rowMeans(select(SAMPLE, all_of(subject_mean_cols)))
)

TABLE
kable(TABLE)
| REGION | mu_math | mu_reading | mu_science | mu_glcm | se_math | se_reading | se_science | se_glcm | mu_subs |
|---|---|---|---|---|---|---|---|---|---|
| Region 1 | 341.8855 | 321.0015 | 340.7932 | 353.3641 | 3.193656 | 3.309686 | 3.028230 | 3.278626 | 339.2611 |
| Region 2 | 344.6461 | 326.6504 | 346.8693 | 363.0625 | 4.168193 | 4.103345 | 3.832510 | 4.170575 | 345.3071 |
| Region 3 | 355.4606 | 339.0414 | 350.3069 | 367.9459 | 2.339399 | 2.436344 | 2.233627 | 2.350921 | 353.1887 |
| Region 4A | 365.4433 | 349.8223 | 365.8842 | 380.0412 | 1.922768 | 2.152546 | 2.006841 | 2.123175 | 365.2978 |
| Region 4B | 337.1956 | 326.5423 | 339.2683 | 354.9788 | 4.351104 | 4.286446 | 3.959615 | 4.300548 | 339.4962 |
| Region 5 | 338.2182 | 327.4742 | 345.0808 | 363.7356 | 3.361817 | 3.384346 | 3.087450 | 3.319069 | 343.6272 |
| Region 6 | 345.2129 | 333.8645 | 354.2873 | 367.2314 | 3.963424 | 4.095302 | 3.675700 | 3.838992 | 350.1490 |
| Region 7 | 364.0297 | 353.4637 | 375.3081 | 386.8256 | 3.265602 | 3.666712 | 3.148331 | 3.638791 | 369.9068 |
| Region 8 | 341.1649 | 343.5006 | 351.3861 | 370.2218 | 4.002170 | 4.255857 | 3.968101 | 4.164208 | 351.5683 |
| Region 9 | 329.9775 | 309.2361 | 331.4282 | 345.7618 | 4.315175 | 4.039748 | 3.767901 | 4.035723 | 329.1009 |
| Region 10 | 342.0046 | 330.7653 | 343.0657 | 353.5956 | 4.243819 | 4.585288 | 3.957055 | 4.768487 | 342.3578 |
| Region 11 | 342.1345 | 331.7825 | 349.3926 | 362.8933 | 3.445807 | 3.935716 | 3.555330 | 3.945300 | 346.5507 |
| Region 12 | 313.7029 | 297.4103 | 326.3249 | 339.4117 | 3.691128 | 3.466314 | 3.087471 | 3.304096 | 319.2125 |
| NCR | 384.9278 | 371.5097 | 390.4726 | 401.8871 | 2.393303 | 2.794039 | 2.449867 | 2.736760 | 387.1993 |
| CAR | 369.8061 | 354.3165 | 361.3156 | 383.1771 | 6.814103 | 6.866394 | 6.520257 | 6.998269 | 367.1538 |
| CARAGA | 305.2946 | 299.5162 | 317.1731 | 333.1910 | 4.445785 | 4.036499 | 3.637917 | 3.987865 | 313.7937 |
| NIR | 342.3126 | 335.8353 | 359.8877 | 369.5537 | 4.359274 | 4.657765 | 4.430331 | 4.569060 | 351.8973 |
Answer: The region that performs best in these subjects is “NCR”, with a grand mean of approximately 387.20, while the worst is “CARAGA”, with a grand mean of about 313.79.
#12. Generate a visualization of the means (with error bars) of MATH, READING, SCIENCE, and GLCM by sex. Are there glaring sex differences in these subjects areas?
# Task 12 (summary step): mean and standard deviation of each score by sex,
# rounded to two decimals. Rows missing any of the four scores are excluded.
sex_labels <- c("1" = "Female", "2" = "Male")

TRY <- phil_pisa2018_glcm %>%
  # Codes other than "1" / "2" give NA and form their own group.
  mutate(SEX = unname(sex_labels[as.character(ST004D01T)])) %>%
  drop_na(MATH, READING, SCIENCE, GLCM) %>%
  group_by(SEX) %>%
  summarise(
    mean_math = round(mean(MATH), 2),
    mean_reading = round(mean(READING), 2),
    mean_science = round(mean(SCIENCE), 2),
    mean_glcm = round(mean(GLCM), 2),
    sd_math = round(sd(MATH), 2),
    sd_reading = round(sd(READING), 2),
    sd_science = round(sd(SCIENCE), 2),
    sd_glcm = round(sd(GLCM), 2)
  )
# Task 12 (plot step): one bar chart per subject showing the mean score by
# sex, with error bars spanning mean - sd to mean + sd.

# Build the mean-by-sex bar chart for one subject.
#
# summary_tbl table with a SEX column plus the mean and sd columns
# mean_col    name of the column holding the mean, as a string
# sd_col      name of the column holding the standard deviation, as a string
# y_label     y-axis title
# y_max       upper limit of the y axis; bars or error bars above it are
#             dropped from the plot
#
# Returns the ggplot object.
plot_mean_by_sex <- function(summary_tbl, mean_col, sd_col, y_label,
                             y_max = 450) {
  ggplot(summary_tbl, aes(x = SEX, y = .data[[mean_col]])) +
    geom_bar(
      fill = "light blue", stat = "identity",
      position = position_dodge(), color = "black"
    ) +
    geom_errorbar(
      aes(
        x = SEX,
        ymin = .data[[mean_col]] - .data[[sd_col]],
        ymax = .data[[mean_col]] + .data[[sd_col]]
      ),
      width = 0.1,
      color = "black",
      position = position_dodge(0.9)
    ) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, y_max)) +
    # Print the mean just above the top of each bar.
    geom_text(aes(label = .data[[mean_col]]), vjust = -0.5) +
    labs(x = "SEX", y = y_label) +
    theme_classic()
}

errorbar_math <- plot_mean_by_sex(
  TRY, "mean_math", "sd_math", "Mean of MATH"
)
errorbar_math

errorbar_reading <- plot_mean_by_sex(
  TRY, "mean_reading", "sd_reading", "Mean of READING"
)
errorbar_reading

errorbar_science <- plot_mean_by_sex(
  TRY, "mean_science", "sd_science", "Mean of SCIENCE"
)
errorbar_science

# GLCM uses a taller axis (500) than the other three subjects (450).
errorbar_glcm <- plot_mean_by_sex(
  TRY, "mean_glcm", "sd_glcm", "Mean of GLCM", y_max = 500
)
errorbar_glcm
Answer: Based on the results, there is a difference between the sexes
in each subject, but the difference in GLCM is more obvious than in the
others. Generally, females have a higher mean than males in every
subject.
#13. Determine the percentage of students in each reading proficiency level for all regions and indicate the regional level of reading proficiency based on the above guide. Describe the data completely based on the supplementary file. Which region has the best and worst reading proficiency?
# Task 13: percentage of students at each reading proficiency level, per
# region. The result is grouped by REGION and has one row per
# (REGION, Reading_Level) pair.
Reading_Levels <- phil_pisa2018_glcm %>%
  select(REGION, READING) %>%
  # case_when() takes the first matching arm, so each arm only needs its
  # upper bound. A missing READING stays NA. The level is kept as a
  # character column, as before.
  mutate(Reading_Level = case_when(
    READING < 262 ~ "Level 1c",
    READING < 335 ~ "Level 1b",
    READING < 407 ~ "Level 1a",
    READING < 480 ~ "Level 2",
    READING < 553 ~ "Level 3",
    READING < 626 ~ "Level 4",
    READING < 698 ~ "Level 5",
    READING >= 698 ~ "Level 6"
  )) %>%
  count(REGION, Reading_Level, name = "count_reading") %>%
  # Group by region explicitly so the percentages add up to 100 within each
  # region (the original relied on the grouping left over by summarise()).
  group_by(REGION) %>%
  mutate(
    frequency_reading = round(count_reading / sum(count_reading), 4) * 100
  )
## `summarise()` has grouped output by 'REGION'. You can override using the
## `.groups` argument.
# Reshape the reading percentages to one row per region and one column per
# proficiency level, ordered from the lowest level to the highest.
reading_level_order <- c(
  "Level 1c", "Level 1b", "Level 1a", "Level 2",
  "Level 3", "Level 4", "Level 5", "Level 6"
)

WIDE_reading <- Reading_Levels %>%
  select(REGION, Reading_Level, frequency_reading) %>%
  pivot_wider(
    names_from = Reading_Level,
    values_from = frequency_reading,
    # A level with no students in a region is 0 percent, not missing.
    values_fill = 0
  ) %>%
  # Order columns by level name instead of by hard-coded positions, which
  # depended on the order in which levels happened to appear. Levels absent
  # from the data are skipped; any other column is kept at the end.
  select(REGION, any_of(reading_level_order), everything())

# Both names are kept because the original assigned the same table to each.
Reorder <- WIDE_reading
Reorder
Answer: The region with the best reading proficiency based on the guide is “NCR”, where around 33% of its participants have a reading proficiency of at least Level 2. It ranks second among the regions in the percentage of participants at reading proficiency Level 4. It also has the lowest percentage of participants at reading proficiency Level 1c. The region with the worst reading proficiency is “Region 12”, where around 95% of the participants belong to reading proficiency Level 1 (1a, 1b, and 1c). Moreover, only about 1% of its participants reach Level 3, which is very low given that Level 3 is the region’s highest reading proficiency level.
#14. Determine the percentage of students in each mathematics proficiency level for all regions and indicate the regional level of mathematics proficiency based on the above guide. Describe the data completely based on the supplementary file. Which region has the best and worst mathematics proficiency?
# Task 14: percentage of students at each mathematics proficiency level, per
# region. The result is grouped by REGION and has one row per
# (REGION, Math_Level) pair.
Math_Levels <- phil_pisa2018_glcm %>%
  select(REGION, MATH) %>%
  # Scores are compared with numbers. The original compared them with
  # strings such as "420", which makes R compare the values as text.
  # case_when() takes the first matching arm, so each arm only needs its
  # upper bound. A missing MATH stays NA.
  # NOTE(review): the original tested `< "483"` for Level 2 but `>= "482"`
  # for Level 3; 482 is used here as the single boundary -- confirm it
  # against the proficiency guide.
  mutate(Math_Level = case_when(
    MATH < 420 ~ "Level 1",
    MATH < 482 ~ "Level 2",
    MATH < 545 ~ "Level 3",
    MATH < 607 ~ "Level 4",
    MATH < 669 ~ "Level 5",
    # The original label was "6"; made consistent with the other levels.
    MATH >= 669 ~ "Level 6"
  )) %>%
  count(REGION, Math_Level, name = "count_math") %>%
  # Percentages are computed within each region.
  group_by(REGION) %>%
  mutate(frequency_math = round(count_math / sum(count_math), 4) * 100)
## `summarise()` has grouped output by 'REGION'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'REGION'. You can override using the
## `.groups` argument.
# Show the math percentages with one row per region and one column per
# proficiency level (columns appear in the order the levels occur).
math_level_freq <- select(Math_Levels, REGION, Math_Level, frequency_math)
pivot_wider(
  math_level_freq,
  names_from = Math_Level,
  values_from = frequency_math
)
Answer: The region with the best Mathematics proficiency is “NCR”, which has the lowest percentage of participants at Mathematics proficiency Level 1, at just about 68%, compared with around 80% and above in most of the other regions. In addition, it has the highest percentage of participants in Levels 3 and 4. In contrast, “Region 12” and “CARAGA” have the worst Mathematics proficiency: about 95% of their participants are at Level 1, and their highest proficiency level is only Level 3, reached by less than 1% of their participants.
#15. Which grade level and sex has the highest and lowest proficiency levels in reading and mathematics?
# Task 15 (preparation): one row per student with grade level, sex and the
# MATH and READING scores. Both code columns are converted to factors on the
# working data frame (the original converted ST004D01T on the raw
# `phil_pisa2018` frame, which this pipeline never reads).
phil_pisa2018_glcm$ST001D01T <- as.factor(phil_pisa2018_glcm$ST001D01T)
phil_pisa2018_glcm$ST004D01T <- as.factor(phil_pisa2018_glcm$ST004D01T)

math_and_reading <- phil_pisa2018_glcm %>%
  mutate(GRADE_LEVEL = case_when(
    ST001D01T == "7" ~ "Grade 7",
    ST001D01T == "8" ~ "Grade 8",
    ST001D01T == "9" ~ "Grade 9",
    ST001D01T == "10" ~ "Grade 10",
    ST001D01T == "11" ~ "Grade 11",
    ST001D01T == "12" ~ "Grade 12"
  )) %>%
  mutate(SEX = case_when(
    ST004D01T == "1" ~ "Female",
    ST004D01T == "2" ~ "Male"
  )) %>%
  select(GRADE_LEVEL, SEX, MATH, READING) %>%
  # Rows with an unrecognised grade or sex code, or a missing score, are
  # excluded.
  drop_na(GRADE_LEVEL, SEX, MATH, READING) %>%
  # Group by the two categorical columns only. The original also grouped by
  # the continuous MATH and READING scores, which creates roughly one group
  # per student.
  group_by(GRADE_LEVEL, SEX)
math_and_reading
# Task 15 (mathematics): number of students at each math proficiency level,
# by grade level and sex.
Summary1 <- math_and_reading %>%
  # Start from ungrouped rows so the classification below does not depend on
  # whatever grouping `math_and_reading` carries.
  ungroup() %>%
  select(GRADE_LEVEL, SEX, MATH) %>%
  # Scores are compared with numbers. The original compared them with
  # strings such as "420", which makes R compare the values as text.
  # case_when() takes the first matching arm, so each arm only needs its
  # upper bound.
  # NOTE(review): the original tested `< "483"` for Level 2 but `>= "482"`
  # for Level 3; 482 is used here as the single boundary -- confirm it
  # against the proficiency guide.
  mutate(Math_Level = case_when(
    MATH < 420 ~ "Level 1",
    MATH < 482 ~ "Level 2",
    MATH < 545 ~ "Level 3",
    MATH < 607 ~ "Level 4",
    MATH < 669 ~ "Level 5",
    # The original label was "6"; made consistent with the other levels.
    MATH >= 669 ~ "Level 6"
  )) %>%
  group_by(Math_Level, GRADE_LEVEL, SEX) %>%
  # Keep the result grouped by Math_Level and GRADE_LEVEL, as before.
  summarise(count_math = n(), .groups = "drop_last")
## Adding missing grouping variables: `READING`
## `summarise()` has grouped output by 'GRADE_LEVEL', 'SEX', 'MATH'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Math_Level', 'GRADE_LEVEL'. You can
## override using the `.groups` argument.
# Display the student counts per math level, grade level and sex.
Summary1
# Task 15 (reading): number of students at each reading proficiency level,
# by grade level and sex.
SUMMARY2 <- math_and_reading %>%
  # Start from ungrouped rows so the classification below does not depend on
  # whatever grouping `math_and_reading` carries.
  ungroup() %>%
  select(GRADE_LEVEL, SEX, READING) %>%
  # case_when() takes the first matching arm, so each arm only needs its
  # upper bound.
  mutate(Reading_Level = case_when(
    READING < 262 ~ "Level 1c",
    READING < 335 ~ "Level 1b",
    READING < 407 ~ "Level 1a",
    READING < 480 ~ "Level 2",
    READING < 553 ~ "Level 3",
    READING < 626 ~ "Level 4",
    READING < 698 ~ "Level 5",
    READING >= 698 ~ "Level 6"
  )) %>%
  group_by(Reading_Level, GRADE_LEVEL, SEX) %>%
  # Keep the result grouped by Reading_Level and GRADE_LEVEL, as before.
  summarise(count_reading = n(), .groups = "drop_last")
## Adding missing grouping variables: `MATH`
## `summarise()` has grouped output by 'Reading_Level', 'GRADE_LEVEL'. You can
## override using the `.groups` argument.
# Display the student counts per reading level, grade level and sex.
SUMMARY2
Answer: In Mathematics, Grades 9 and 10 reach the highest proficiency level, Level 5, and only among male students. These grades also include students at the lowest proficiency level in the subject, this time among both males and females. In Reading, only Grade 10 has students at proficiency Level 5, for both sexes. It is also one of the grades with students at proficiency Level 1, again for both sexes. In conclusion, Grade 10 male students are the group with both the highest and the lowest proficiency levels in both subjects.
#16. Is there a relationship between MATH, READING, SCIENCE, and GLCM? Is this relationship the same for Grade 9 and Grade 10 students? For male and female students?
# Task 16: correlation matrix of the four scores over all students.
subject_scores <- select(phil_pisa2018_glcm, MATH, READING, SCIENCE, GLCM)
subjects <- cor(subject_scores)
subjects
## MATH READING SCIENCE GLCM
## MATH 1.0000000 0.9041794 0.8776850 0.8398788
## READING 0.9041794 1.0000000 0.9318180 0.9192905
## SCIENCE 0.8776850 0.9318180 1.0000000 0.8857789
## GLCM 0.8398788 0.9192905 0.8857789 1.0000000
# Scores with a grade label for Grade 9 and Grade 10 only; every other grade
# code (and a missing code) gets NA. The result is grouped by that label.
# case_when() replaces the NA/replace() chain to match the rest of the file.
GRADE <- phil_pisa2018_glcm %>%
  select(MATH, READING, SCIENCE, GLCM, ST001D01T) %>%
  mutate(GRADE_LEVEL1 = case_when(
    ST001D01T == "9" ~ "Grade 9",
    ST001D01T == "10" ~ "Grade 10"
  )) %>%
  group_by(GRADE_LEVEL1)
GRADE
# Pairwise correlations between the four scores, computed separately for
# Grade 9 and Grade 10. Rows without a grade label are removed once here
# instead of in each of the six pipelines.
grade_9_10 <- GRADE %>%
  group_by(GRADE_LEVEL1) %>%
  drop_na(GRADE_LEVEL1)

# MATH vs READING
ONE <- summarise(grade_9_10, "COR M&R" = cor(MATH, READING))
ONE

# MATH vs SCIENCE
TWO <- summarise(grade_9_10, "COR M&S" = cor(MATH, SCIENCE))
TWO

# MATH vs GLCM
THREE <- summarise(grade_9_10, "COR M&G" = cor(MATH, GLCM))
THREE

# READING vs SCIENCE
FOUR <- summarise(grade_9_10, "COR R&S" = cor(READING, SCIENCE))
FOUR

# READING vs GLCM (the column label was misspelled "COR R$G").
FIVE <- summarise(grade_9_10, "COR R&G" = cor(READING, GLCM))
FIVE

# SCIENCE vs GLCM
SIX <- summarise(grade_9_10, "COR S&G" = cor(SCIENCE, GLCM))
SIX
# Scores with a sex label ("Female" for code 1, "Male" for code 2; anything
# else becomes NA). The result is grouped by that label.
gender_labels <- c("1" = "Female", "2" = "Male")
gender_scores <- select(
  phil_pisa2018_glcm, MATH, READING, SCIENCE, GLCM, ST004D01T
)
GENDER <- gender_scores %>%
  mutate(SEX = unname(gender_labels[as.character(ST004D01T)])) %>%
  group_by(SEX)
GENDER
# Pairwise correlations between the four scores, computed separately for
# each sex. Rows without a sex label are removed once here instead of in
# each of the six pipelines.
sex_known <- GENDER %>%
  group_by(SEX) %>%
  drop_na(SEX)

# MATH vs READING
FIRST <- summarise(sex_known, "COR M&R" = cor(MATH, READING))
FIRST

# MATH vs SCIENCE
SECOND <- summarise(sex_known, "COR M&S" = cor(MATH, SCIENCE))
SECOND

# The original wrote `THIRD<-FIRST<-...` (and likewise for the next three),
# which overwrote FIRST each time. Each table now has only its own name.

# MATH vs GLCM
THIRD <- summarise(sex_known, "COR M&G" = cor(MATH, GLCM))
THIRD

# READING vs SCIENCE
FOURTH <- summarise(sex_known, "COR R&S" = cor(READING, SCIENCE))
FOURTH

# READING vs GLCM
FIFTH <- summarise(sex_known, "COR R&G" = cor(READING, GLCM))
FIFTH

# SCIENCE vs GLCM
SIXTH <- summarise(sex_known, "COR S&G" = cor(SCIENCE, GLCM))
SIXTH
Answer: The subjects have a high positive relationship with each other, with correlation values of 0.80 and above. As shown in the tables, the relationship remains high and positive for the sexes and grade levels specified. Although the values are not exactly the same, they are close to each other and lead to the same conclusion. This means that the subjects have a direct relationship: the scores increase or decrease together.
#17. Provide concluding remarks based on your exploration of the data set. Give the important highlights.
Answer: Based on the exploration of the data, the information was taken from the Philippines, which is evident from the 17 regions mentioned. In terms of participants, the data show that there are more females than males in every region. The difference between the sexes in each subject is not that large or obvious, except for GLCM, where the difference is around 30. In Math and Reading, the best-performing region is NCR, and the worst-performing regions are Region 12 and CARAGA. Generally, we can say that Grade 10 male students are the group with both the highest and the lowest proficiency levels in Mathematics and Reading. In conclusion, only a few regions show a high level of understanding of Mathematics and Reading based on the proficiency level guide.