Challenge 7

Author

Jingyi Yang

1. Start Up

# Chunk defaults for the whole report: echo the code, but keep warnings and
# messages out of the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
library(readxl)
library(stringr)
library(dplyr)
library(lubridate)
library("khroma")
library(here)
here() starts at C:/8-601
library("ggthemes")

2. Import the data

2.1 Import the “Public School” data set

# Read the public school characteristics file. The path is built from the
# project root reported by here() (C:/8-601) instead of changing the working
# directory with setwd(), so the chunk leaves no global side effect behind.
Public_School <- read_csv(
  here("challenge_datasets", "Public_School_Characteristics_2017-18.csv")
)

# Show the first 10 rows with every column visible.
Public_School %>% print(n = 10, width = Inf)
# A tibble: 100,729 × 79
       X     Y OBJECTID NCESSCH      NMCNTY                            SURVYEAR 
   <dbl> <dbl>    <dbl> <chr>        <chr>                             <chr>    
 1 -149.  61.6        1 020051000480 Matanuska-Susitna Borough         2017-2018
 2 -157.  71.3        2 020061000470 North Slope Borough               2017-2018
 3 -151.  60.5        3 020039000448 Kenai Peninsula Borough           2017-2018
 4 -151.  60.6        4 020039000463 Kenai Peninsula Borough           2017-2018
 5 -151.  60.6        5 020039000513 Kenai Peninsula Borough           2017-2018
 6 -133.  56.1        6 020070000526 Prince of Wales-Hyder Census Area 2017-2018
 7 -135.  57.5        7 020073000477 Hoonah-Angoon Census Area         2017-2018
 8 -149.  63.9        8 020077000447 Denali Borough                    2017-2018
 9 -166.  54.1        9 020000700004 Aleutians East Borough            2017-2018
10 -163.  54.9       10 020000700007 Aleutians East Borough            2017-2018
   STABR LEAID   ST_LEAID LEA_NAME                                 
   <chr> <chr>   <chr>    <chr>                                    
 1 AK    0200510 AK-33    Matanuska-Susitna Borough School District
 2 AK    0200610 AK-36    North Slope Borough School District      
 3 AK    0200390 AK-24    Kenai Peninsula Borough School District  
 4 AK    0200390 AK-24    Kenai Peninsula Borough School District  
 5 AK    0200390 AK-24    Kenai Peninsula Borough School District  
 6 AK    0200700 AK-44    Southeast Island School District         
 7 AK    0200730 AK-09    Chatham School District                  
 8 AK    0200770 AK-02    Denali Borough School District           
 9 AK    0200007 AK-56    Aleutians East Borough School District   
10 AK    0200007 AK-56    Aleutians East Borough School District   
   SCH_NAME                              LSTREET1               LSTREET2
   <chr>                                 <chr>                  <chr>   
 1 John Shaw Elementary                  3750 E Paradise Ln     <NA>    
 2 Kiita Learning Community              5246 Karluk St         <NA>    
 3 Soldotna Montessori Charter School    158 E Park Ave         <NA>    
 4 Kaleidoscope School of Arts & Science 549 N Forest Dr        <NA>    
 5 Marathon School                       405 Marathon Rd        <NA>    
 6 Whale Pass School                     126 Bayview Rd         <NA>    
 7 Chatham Correspondence                500 Big Dog Salmon Way <NA>    
 8 Denali PEAK                           1 Suntrana St          <NA>    
 9 Akutan School                         202 Volcano Dr         <NA>    
10 False Pass School                     300 Valley Rd          <NA>    
   LSTREET3 LCITY      LSTATE LZIP  LZIP4 PHONE         GSLO  GSHI 
   <lgl>    <chr>      <chr>  <chr> <chr> <chr>         <chr> <chr>
 1 NA       Wasilla    AK     99654 <NA>  (907)352-0500 PK    05   
 2 NA       Utqiagvik  AK     99723 <NA>  (907)852-9677 09    12   
 3 NA       Soldotna   AK     99669 <NA>  (907)260-9221 KG    06   
 4 NA       Kenai      AK     99611 <NA>  (907)283-0804 KG    05   
 5 NA       Kenai      AK     99611 <NA>  (907)335-3343 07    12   
 6 NA       Whale Pass AK     99950 <NA>  (907)846-5320 PK    12   
 7 NA       Angoon     AK     99820 <NA>  (907)788-3302 KG    12   
 8 NA       Healy      AK     99743 <NA>  (907)683-2278 PK    12   
 9 NA       Akutan     AK     99553 <NA>  (907)698-2205 PK    12   
10 NA       False Pass AK     99583 <NA>  (907)548-2224 PK    12   
   VIRTUAL              TOTFRL FRELCH REDLCH    PK    KG   G01   G02   G03   G04
   <chr>                 <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 Not a virtual school    183    158     25    30    81    63    80    62    58
 2 Not a virtual school     27     27      0    NA    NA    NA    NA    NA    NA
 3 Not a virtual school     43     23     20    NA    23    23    27    22    25
 4 Not a virtual school     69     50     19    NA    40    43    42    46    46
 5 Not a virtual school     -9     -9     -9    NA    NA    NA    NA    NA    NA
 6 Not a virtual school     17     17      0     0     0     3     1     2     2
 7 Not a virtual school      3     -1     -1    NA     2     2     1     1     1
 8 Not a virtual school      3     -1     -1    42    40    44    56    59    61
 9 Not a virtual school      3     -1     -1     0     4     0     3     1     1
10 Not a virtual school     -9     -9     -9     0     0     0     1     1     0
     G05   G06   G07   G08   G09   G10   G11   G12 G13   TOTAL MEMBER    AM
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>  <dbl> <dbl>
 1    73    NA    NA    NA    NA    NA    NA    NA NA      447    447    50
 2    NA    NA    NA    NA     0     3     7    20 NA       30     30    27
 3    28    19    NA    NA    NA    NA    NA    NA NA      167    167     8
 4    43    NA    NA    NA    NA    NA    NA    NA NA      260    260    16
 5    NA    NA     0     1     1     2     1     0 NA        5      5     0
 6     2     1     5     1     0     0     0     1 NA       18     18     0
 7     0     0     0     0     1     2     0     1 NA       11     11     2
 8    59    54    55    74    47    51    48    47 NA      737    737    53
 9     0     2     0     0     0     0     1     1 NA       13     13    11
10     1     0     1     1     1     0     0     0 NA        6      6     4
      HI    BL    WH    HP    TR    FTE LATCOD LONCOD ULOCALE         
   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl> <chr>           
 1    12     5   351     2    23 24.9     61.6  -149. 41-Rural: Fringe
 2     0     0     0     1     2  3       71.3  -157. 33-Town: Remote 
 3     5     0   136     0    15 10.4     60.5  -151. 33-Town: Remote 
 4    14     3   168     0    56 16.8     60.6  -151. 33-Town: Remote 
 5     0     1     3     1     0  0.670   60.6  -151. 33-Town: Remote 
 6     1     0    13     0     4  1.90    56.1  -133. 43-Rural: Remote
 7     0     5     4     0     0  0       57.5  -135. 43-Rural: Remote
 8    76    39   443     8    97  5.79    63.9  -149. 43-Rural: Remote
 9     0     0     1     0     1  1.96    54.1  -166. 43-Rural: Remote
10     0     0     2     0     0  1.39    54.9  -163. 43-Rural: Remote
   STUTERATIO STITLEI        AMALM AMALF ASALM ASALF HIALM HIALF BLALM BLALF
        <dbl> <chr>          <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1      18.0  Yes               33    17     1     3    10     2     3     2
 2      10    Not Applicable    16    11     0     0     0     0     0     0
 3      16.1  Not Applicable     4     4     0     3     2     3     0     0
 4      15.5  Not Applicable    10     6     1     2     6     8     3     0
 5       7.46 Yes                0     0     0     0     0     0     0     1
 6       9.47 Yes                0     0     0     0     1     0     0     0
 7      NA    Not Applicable     1     1     0     0     0     0     3     2
 8     127.   Not Applicable    21    32    13     8    33    43    20    19
 9       6.63 Yes                6     5     0     0     0     0     0     0
10       4.32 Yes                2     2     0     0     0     0     0     0
   WHALM WHALF HPALM HPALF TRALM TRALF TOTMENROL TOTFENROL STATUS    UG AE   
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>  <dbl> <dbl> <lgl>
 1   193   158     0     2    11    12       251       196      1    NA NA   
 2     0     0     1     0     1     1        18        12      1    NA NA   
 3    58    78     0     0     7     8        71        96      1    NA NA   
 4    82    86     0     0    26    30       128       132      1    NA NA   
 5     1     2     0     1     0     0         1         4      1    NA NA   
 6     5     8     0     0     4     0        10         8      1    NA NA   
 7     1     3     0     0     0     0         5         6      1    NA NA   
 8   221   222     4     4    48    49       360       377      1    NA NA   
 9     0     1     0     0     1     0         7         6      1    NA NA   
10     1     1     0     0     0     0         3         3      1    NA NA   
   SCHOOL_TYPE_TEXT         SY_STATUS_TEXT        SCHOOL_LEVEL    AS
   <chr>                    <chr>                 <chr>        <dbl>
 1 Regular school           Currently operational Elementary       4
 2 Alternative/other school Currently operational High             0
 3 Regular school           Currently operational Elementary       3
 4 Regular school           Currently operational Elementary       3
 5 Alternative/other school Currently operational High             0
 6 Regular school           Currently operational Other            0
 7 Regular school           Currently operational Other            0
 8 Regular school           Currently operational Other           21
 9 Regular school           Currently operational Other            0
10 Regular school           Currently operational Other            0
   CHARTER_TEXT MAGNET_TEXT
   <chr>        <chr>      
 1 No           No         
 2 No           No         
 3 Yes          No         
 4 Yes          No         
 5 No           No         
 6 No           No         
 7 No           No         
 8 No           No         
 9 No           No         
10 No           No         
# ℹ 100,719 more rows

2.2 Import the “Marriage Law Vote” data set

# Read "Table 2" of the marriage law postal survey workbook. The path is built
# from the project root reported by here() (C:/8-601) instead of calling
# setwd(), so the chunk does not change the working directory.
# The first 6 rows of the sheet are skipped and flat column names are supplied
# by hand; the column named "delete" is removed later during cleaning.
law_vote <- read_excel(
  here(
    "challenge_datasets",
    "australian_marriage_law_postal_survey_2017_-_response_final.xls"
  ),
  sheet = "Table 2",
  skip = 6,
  col_names = c(
    "District",
    "Response clear_yes_number",
    "Response clear_yes_percentage",
    "Response clear_no_number",
    "Response clear_no_percentage",
    "Response clear_total_number",
    "Response clear_total_percentage",
    "delete",
    "Eligible Participants_Response clear_number",
    "Eligible Participants_Response clear_percentage",
    "Eligible Participants_Response not clear(b)_number",
    "Eligible Participants_Response not clear(b)_percentage",
    "Eligible Participants_Non-responding_number",
    "Eligible Participants_Non-responding_percentage",
    "Eligible Participants_total_number",
    "Eligible Participants_total_percentage"
  )
) %>%
  # Rows without a District label carry no data.
  drop_na(District) %>%
  # The pattern is a regular expression, so the parentheses form a group and
  # any District containing "Total" is removed.
  filter(!str_detect(District, "(Total)")) %>%
  # Drop rows 160 to 165 of what remains.
  # NOTE(review): presumably trailing summary/footnote rows -- confirm
  # against the workbook.
  slice(-(160:165))


# Show the first 10 rows with every column visible.
law_vote %>% print(n = 10, width = Inf)
# A tibble: 159 × 16
   District                  `Response clear_yes_number`
   <chr>                     <chr>                      
 1 New South Wales Divisions <NA>                       
 2 Banks                     37736                      
 3 Barton                    37153                      
 4 Bennelong                 42943                      
 5 Berowra                   48471                      
 6 Blaxland                  20406                      
 7 Bradfield                 53681                      
 8 Calare                    54091                      
 9 Chifley                   32871                      
10 Cook                      47505                      
   `Response clear_yes_percentage` `Response clear_no_number`
   <chr>                           <chr>                     
 1 <NA>                            <NA>                      
 2 44.899999999999999              46343                     
 3 43.600000000000001              47984                     
 4 49.799999999999997              43215                     
 5 54.600000000000001              40369                     
 6 26.100000000000001              57926                     
 7 60.600000000000001              34927                     
 8 60.200000000000003              35779                     
 9 41.299999999999997              46702                     
10 55                              38804                     
   `Response clear_no_percentage` `Response clear_total_number`
   <chr>                          <chr>                        
 1 <NA>                           <NA>                         
 2 55.100000000000001             84079                        
 3 56.399999999999999             85137                        
 4 50.200000000000003             86158                        
 5 45.399999999999999             88840                        
 6 73.900000000000006             78332                        
 7 39.399999999999999             88608                        
 8 39.799999999999997             89870                        
 9 58.700000000000003             79573                        
10 45                             86309                        
   `Response clear_total_percentage` delete
   <chr>                             <lgl> 
 1 <NA>                              NA    
 2 100                               NA    
 3 100                               NA    
 4 100                               NA    
 5 100                               NA    
 6 100                               NA    
 7 100                               NA    
 8 100                               NA    
 9 100                               NA    
10 100                               NA    
   `Eligible Participants_Response clear_number`
   <chr>                                        
 1 <NA>                                         
 2 84079                                        
 3 85137                                        
 4 86158                                        
 5 88840                                        
 6 78332                                        
 7 88608                                        
 8 89870                                        
 9 79573                                        
10 86309                                        
   `Eligible Participants_Response clear_percentage`
   <chr>                                            
 1 <NA>                                             
 2 79.900000000000006                               
 3 77.799999999999997                               
 4 81                                               
 5 84.5                                             
 6 75                                               
 7 83.5                                             
 8 77.799999999999997                               
 9 73.700000000000003                               
10 82                                               
   `Eligible Participants_Response not clear(b)_number`
   <chr>                                               
 1 <NA>                                                
 2 247                                                 
 3 226                                                 
 4 244                                                 
 5 212                                                 
 6 220                                                 
 7 202                                                 
 8 285                                                 
 9 263                                                 
10 229                                                 
   `Eligible Participants_Response not clear(b)_percentage`
   <chr>                                                   
 1 <NA>                                                    
 2 0.20000000000000001                                     
 3 0.20000000000000001                                     
 4 0.20000000000000001                                     
 5 0.20000000000000001                                     
 6 0.20000000000000001                                     
 7 0.20000000000000001                                     
 8 0.20000000000000001                                     
 9 0.20000000000000001                                     
10 0.20000000000000001                                     
   `Eligible Participants_Non-responding_number`
   <chr>                                        
 1 <NA>                                         
 2 20928                                        
 3 24008                                        
 4 19973                                        
 5 16038                                        
 6 25883                                        
 7 17261                                        
 8 25342                                        
 9 28180                                        
10 18713                                        
   `Eligible Participants_Non-responding_percentage`
   <chr>                                            
 1 <NA>                                             
 2 19.899999999999999                               
 3 22                                               
 4 18.800000000000001                               
 5 15.300000000000001                               
 6 24.800000000000001                               
 7 16.300000000000001                               
 8 21.899999999999999                               
 9 26.100000000000001                               
10 17.800000000000001                               
   `Eligible Participants_total_number` `Eligible Participants_total_percentage`
   <chr>                                <chr>                                   
 1 <NA>                                 <NA>                                    
 2 105254                               100                                     
 3 109371                               100                                     
 4 106375                               100                                     
 5 105090                               100                                     
 6 104435                               100                                     
 7 106071                               100                                     
 8 115497                               100                                     
 9 108016                               100                                     
10 105251                               100                                     
# ℹ 149 more rows

3. Clean the data set

3.1 Clean the “Public School” data set

I kept only the columns I needed for later data analysis and visualization, renamed them to clarify what information each column contains, and filtered out the out-of-range values.

# Keep the six columns used in the analysis and give them descriptive names.
# select() renames while it selects, so no separate rename() step is needed.
# NOTE(review): FTE is labelled "Full Time Enrollment Rate" here; in the source
# data it may instead be a full-time-equivalent teacher count -- confirm.
Public_School_clean <- Public_School %>%
  select(
    `School Name` = SCH_NAME,
    `States` = LSTATE,
    `Full Time Enrollment Rate` = FTE,
    `Student to Teacher Ratio` = STUTERATIO,
    `School Types` = SCHOOL_TYPE_TEXT,
    `School Level` = SCHOOL_LEVEL
  )

# Distribution in 5% steps, used to choose an outlier cut-off.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
quantile(Public_School_clean$`Full Time Enrollment Rate`, probs = seq(0, 1, 0.05), na.rm = TRUE)
     0%      5%     10%     15%     20%     25%     30%     35%     40%     45% 
   0.00    3.50    8.00   11.38   14.58   17.00   19.40   21.50   23.50   25.50 
    50%     55%     60%     65%     70%     75%     80%     85%     90%     95% 
  27.60   29.80   32.00   34.67   37.60   41.00   45.17   51.05   60.10   79.39 
   100% 
1419.00 
# Distribution in 5% steps, used to choose an outlier cut-off.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
quantile(Public_School_clean$`Student to Teacher Ratio`, probs = seq(0, 1, 0.05), na.rm = TRUE)
      0%       5%      10%      15%      20%      25%      30%      35% 
    0.00     8.02    10.33    11.43    12.21    12.85    13.41    13.94 
     40%      45%      50%      55%      60%      65%      70%      75% 
   14.41    14.87    15.33    15.80    16.30    16.83    17.45    18.18 
     80%      85%      90%      95%     100% 
   19.19    20.54    22.50    25.11 22350.00 
# Trim the extreme upper tail seen in the quantile tables: keep schools below
# 85 on the first measure and below 30 on the second. filter() keeps a row only
# when every condition is TRUE, so rows with NA in either column are dropped.
Public_School_clean <- filter(
  Public_School_clean,
  `Full Time Enrollment Rate` < 85,
  `Student to Teacher Ratio` < 30
)

# Show the first 10 rows with every column visible.
print(Public_School_clean, n = 10, width = Inf)
# A tibble: 88,423 × 6
   `School Name`                         States `Full Time Enrollment Rate`
   <chr>                                 <chr>                        <dbl>
 1 John Shaw Elementary                  AK                          24.9  
 2 Kiita Learning Community              AK                           3    
 3 Soldotna Montessori Charter School    AK                          10.4  
 4 Kaleidoscope School of Arts & Science AK                          16.8  
 5 Marathon School                       AK                           0.670
 6 Whale Pass School                     AK                           1.90 
 7 Akutan School                         AK                           1.96 
 8 False Pass School                     AK                           1.39 
 9 King Cove School                      AK                          12    
10 Sand Point School                     AK                          12.9  
   `Student to Teacher Ratio` `School Types`           `School Level`
                        <dbl> <chr>                    <chr>         
 1                      18.0  Regular school           Elementary    
 2                      10    Alternative/other school High          
 3                      16.1  Regular school           Elementary    
 4                      15.5  Regular school           Elementary    
 5                       7.46 Alternative/other school High          
 6                       9.47 Regular school           Other         
 7                       6.63 Regular school           Other         
 8                       4.32 Regular school           Other         
 9                       7.83 Regular school           Other         
10                       9.27 Regular school           Other         
# ℹ 88,413 more rows

3.2 Clean the “Marriage Law Vote” data set

I copied the division name from each “… Divisions” header row in the “District” column into a new “Division” column, filled it down to the districts below it, and removed the rows that did not contain any data.

# Attach each district to its division.
# Header rows such as "New South Wales Divisions" end in "Divisions"; their
# label is copied into a new Division column and filled down onto the district
# rows below. The "delete" column (NA in the printed rows) is removed so that
# na.omit() then discards only rows with missing data, which includes the
# header rows themselves.
law_vote_clean <- law_vote %>%
  select(-delete) %>%
  mutate(
    Division = if_else(str_ends(District, "Divisions"), District, NA_character_)
  ) %>%
  fill(Division, .direction = "down") %>%
  na.omit()

# Show the first 10 rows with every column visible.
print(law_vote_clean, n = 10, width = Inf)
# A tibble: 150 × 16
   District  `Response clear_yes_number` `Response clear_yes_percentage`
   <chr>     <chr>                       <chr>                          
 1 Banks     37736                       44.899999999999999             
 2 Barton    37153                       43.600000000000001             
 3 Bennelong 42943                       49.799999999999997             
 4 Berowra   48471                       54.600000000000001             
 5 Blaxland  20406                       26.100000000000001             
 6 Bradfield 53681                       60.600000000000001             
 7 Calare    54091                       60.200000000000003             
 8 Chifley   32871                       41.299999999999997             
 9 Cook      47505                       55                             
10 Cowper    57493                       60                             
   `Response clear_no_number` `Response clear_no_percentage`
   <chr>                      <chr>                         
 1 46343                      55.100000000000001            
 2 47984                      56.399999999999999            
 3 43215                      50.200000000000003            
 4 40369                      45.399999999999999            
 5 57926                      73.900000000000006            
 6 34927                      39.399999999999999            
 7 35779                      39.799999999999997            
 8 46702                      58.700000000000003            
 9 38804                      45                            
10 38317                      40                            
   `Response clear_total_number` `Response clear_total_percentage`
   <chr>                         <chr>                            
 1 84079                         100                              
 2 85137                         100                              
 3 86158                         100                              
 4 88840                         100                              
 5 78332                         100                              
 6 88608                         100                              
 7 89870                         100                              
 8 79573                         100                              
 9 86309                         100                              
10 95810                         100                              
   `Eligible Participants_Response clear_number`
   <chr>                                        
 1 84079                                        
 2 85137                                        
 3 86158                                        
 4 88840                                        
 5 78332                                        
 6 88608                                        
 7 89870                                        
 8 79573                                        
 9 86309                                        
10 95810                                        
   `Eligible Participants_Response clear_percentage`
   <chr>                                            
 1 79.900000000000006                               
 2 77.799999999999997                               
 3 81                                               
 4 84.5                                             
 5 75                                               
 6 83.5                                             
 7 77.799999999999997                               
 8 73.700000000000003                               
 9 82                                               
10 79                                               
   `Eligible Participants_Response not clear(b)_number`
   <chr>                                               
 1 247                                                 
 2 226                                                 
 3 244                                                 
 4 212                                                 
 5 220                                                 
 6 202                                                 
 7 285                                                 
 8 263                                                 
 9 229                                                 
10 315                                                 
   `Eligible Participants_Response not clear(b)_percentage`
   <chr>                                                   
 1 0.20000000000000001                                     
 2 0.20000000000000001                                     
 3 0.20000000000000001                                     
 4 0.20000000000000001                                     
 5 0.20000000000000001                                     
 6 0.20000000000000001                                     
 7 0.20000000000000001                                     
 8 0.20000000000000001                                     
 9 0.20000000000000001                                     
10 0.29999999999999999                                     
   `Eligible Participants_Non-responding_number`
   <chr>                                        
 1 20928                                        
 2 24008                                        
 3 19973                                        
 4 16038                                        
 5 25883                                        
 6 17261                                        
 7 25342                                        
 8 28180                                        
 9 18713                                        
10 25197                                        
   `Eligible Participants_Non-responding_percentage`
   <chr>                                            
 1 19.899999999999999                               
 2 22                                               
 3 18.800000000000001                               
 4 15.300000000000001                               
 5 24.800000000000001                               
 6 16.300000000000001                               
 7 21.899999999999999                               
 8 26.100000000000001                               
 9 17.800000000000001                               
10 20.800000000000001                               
   `Eligible Participants_total_number` `Eligible Participants_total_percentage`
   <chr>                                <chr>                                   
 1 105254                               100                                     
 2 109371                               100                                     
 3 106375                               100                                     
 4 105090                               100                                     
 5 104435                               100                                     
 6 106071                               100                                     
 7 115497                               100                                     
 8 108016                               100                                     
 9 105251                               100                                     
10 121322                               100                                     
   Division                 
   <chr>                    
 1 New South Wales Divisions
 2 New South Wales Divisions
 3 New South Wales Divisions
 4 New South Wales Divisions
 5 New South Wales Divisions
 6 New South Wales Divisions
 7 New South Wales Divisions
 8 New South Wales Divisions
 9 New South Wales Divisions
10 New South Wales Divisions
# ℹ 140 more rows

4. Using ggplot functionality

4.1 Visualize the “Public School” data set

4.1.1 ‘Full Time Enrollment Rate’ and ‘Student to Teacher Ratio’

I use the “geom_point()” function to visualize the relationship between “Full Time Enrollment Rate” and “Student to Teacher Ratio” as these variables are numerical.

To make the graphic more reader-friendly, I used

  1. the “col=” argument to make the graphic more colorful and easier to understand;

  2. the “facet_wrap()” function to bring in a third variable, “School Types,” and split the large amount of information from the two numerical variables into smaller panels;

  3. the “alpha=” argument to make each point semi-transparent, which makes overlapping points easier to see;

  4. the “scale” argument to adjust the axis range;

  5. the “theme()” argument to center the title and change the legend’s position from the right to the bottom of the graphic;

  6. the “ggthemes::theme_few()” function to clean up the background, which makes the graphic neater and helps the reader focus on the useful information.

# Scatter plot of "Full Time Enrollment Rate" against "Student to Teacher
# Ratio", with one panel, colour and point shape per school type.
Public_School_clean %>%
  ggplot(aes(
    x = `Student to Teacher Ratio`,
    y = `Full Time Enrollment Rate`,
    shape = `School Types`,
    col = `School Types`
  )) +
  # na.rm is an option of the geom, not an aesthetic: inside aes() it only
  # created a meaningless "na.rm" mapping, so it is passed to geom_point().
  geom_point(alpha = .5, na.rm = TRUE) +
  # Both axes span the full range of the cleaned data.
  # NOTE(review): explicit limits presumably take precedence over
  # scales = "free" below, giving all panels the same range -- confirm this
  # is the intended look.
  scale_x_continuous(limits = range(Public_School_clean$`Student to Teacher Ratio`)) +
  scale_y_continuous(limits = range(Public_School_clean$`Full Time Enrollment Rate`)) +
  facet_wrap(vars(`School Types`), scales = "free") +
  ggthemes::theme_few() +
  labs(title = "'Full Time Enrollment Rate' And 'Student to Teacher Ratio'") +
  # Centre the title and move the legend below the panels.
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

4.1.2 Percentage of school for various states

I created a bar chart to present the information about the percentage of schools for each state. This is because this graph type is suitable for visualizing categorical and numerical variables.

To make the graphic more reader-friendly, I used

  1. the “fill=” argument to make the graphic more colorful;

  2. the “scale” argument to adjust the label and name of the “y” axis;

  3. the “theme()” argument to center the title and change the legend’s position from the right to the bottom of the graphic;

  4. the “ggthemes::theme_few()” to clean up the background, which makes the graphic neater and might help the reader focus more on the useful information.

# Bar chart of each state's share of all schools in the data set.
Public_School_clean %>%
  # One row per state with the number of schools in it.
  count(States, name = "Count") %>%
  mutate(perc = Count / sum(Count)) %>%
  ggplot(aes(x = States, y = perc, fill = States)) +
  geom_col() +
  # The bars show the share of schools located in each state, so the axis is
  # titled accordingly; `labels` is spelled out instead of relying on partial
  # argument matching.
  scale_y_continuous(labels = scales::percent, name = "Percentage of Schools") +
  ggthemes::theme_few() +
  labs(title = "Percentage Of School In Various States") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90),
    legend.position = "bottom"
  )

4.1.3 Percentage and total number for various school levels

I created a bar chart to show the information about the number and percentage of different school levels. This is because this graph type is suitable for visualizing categorical and numerical variables.

To make the graphic more reader-friendly, I used

  1. the “fill=” argument to make the graphic more colorful;

  2. the “scale” argument to adjust the range, label, and name of the “y” axis;

  3. the “theme()” argument to center the title and change the legend’s position from the right to the bottom of the graphic;

  4. the “ggthemes::theme_few()” to clean up the background, which makes the graphic neater and might help the reader focus more on the useful information;

  5. the “geom_text()” function to label the total number of various school levels.

# Bar chart of the share of schools at each school level, with the absolute
# number of schools printed above every bar.
Public_School_clean %>%
  # One row per school level with the number of schools at that level.
  count(`School Level`, name = "Count") %>%
  mutate(perc = Count / sum(Count)) %>%
  ggplot(aes(x = `School Level`, y = perc, fill = `School Level`)) +
  geom_col() +
  # Fixed 0-100% axis; `labels` is spelled out instead of relying on partial
  # argument matching.
  scale_y_continuous(
    limits = c(0, 1),
    labels = scales::percent,
    n.breaks = 10,
    name = "Percentage of School Level"
  ) +
  geom_text(aes(label = Count), size = 3, vjust = -.5) +
  ggthemes::theme_few() +
  labs(title = "Total Number And Percentage For School Level ") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90),
    legend.position = "bottom"
  )

4.1.4 Full-time enrollment rate for various school levels

I created a bar chart to show the full-time enrollment rate at different school levels. This is because this graph type is suitable for visualizing categorical and numerical variables.

To make the graphic more reader-friendly, I used

  1. the “fill=” argument to make the graphic more colorful;

  2. the “scale” argument to adjust the range of the “y” axis;

  3. the position = "dodge" argument to make the bars side by side (which makes the third variable, “School Types,” more clear to show in the graphic);

  4. the “theme()” argument to center the title and change the legend’s position from the right to the bottom of the graphic;

  5. the “ggthemes::theme_few()” to clean up the background, which makes the graphic neater and might help the reader focus more on the useful information.

  # Mean full-time enrollment rate for each school level, split by school type.
  # geom_col() draws one bar per input row, so the per-school data is
  # aggregated first; on the raw rows every school in a level/type group is
  # drawn at the same position and only the tallest bar stays visible.
  # NOTE(review): the axis limits imply the rate is on a 0-100 scale - confirm.
  Public_School_clean %>%
    group_by(`School Level`, `School Types`) %>%
    summarise(
      # Keep the original column name so the default axis title is unchanged.
      `Full Time Enrollment Rate` = mean(`Full Time Enrollment Rate`, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    ggplot(aes(
      x = `School Level`,
      y = `Full Time Enrollment Rate`,
      fill = `School Types`
    )) +
    geom_col(position = "dodge") +
    scale_y_continuous(limits = c(0, 100)) +
    ggthemes::theme_few() +
    labs(title = "Full Time Enrollment Rate For Various School Level") +
    theme(
      plot.title = element_text(hjust = 0.5),
      legend.position = "bottom",
      axis.text.x = element_text(angle = 90)
    )

4.2 Visualize the “Marriage Law Vote” data set

The following analysis focuses on the people who are eligible to vote and on their voting results.

# Keep the eligible-participant counts and reshape them to one row per
# district and response status.
# NOTE(review): "Response clear" / "Response not clear(b)" are relabelled
# Yes / No here - confirm these labels match the survey's meaning.
law_vote_Eligible <- law_vote_clean %>%
  select(Division, District, starts_with("Eligible Participants")) %>%
  rename(
    Yes = `Eligible Participants_Response clear_number`,
    No = `Eligible Participants_Response not clear(b)_number`,
    `No Res.` = `Eligible Participants_Non-responding_number`
  ) %>%
  # `cols` is spelled out instead of relying on partial argument matching.
  pivot_longer(
    cols = c(`Yes`, `No`, `No Res.`),
    names_to = "Status",
    values_to = "Values"
  ) %>%
  # Only the count columns are needed from here on.
  select(-contains("percentage")) %>%
  mutate(
    Status = as_factor(Status),
    # The level is "No Res." (with the trailing period); naming a level that
    # does not exist only triggers an "unknown levels" warning.
    Status = fct_relevel(Status, "Yes", "No", "No Res."),
    `Eligible Participants_total_number` =
      as.numeric(`Eligible Participants_total_number`),
    Values = as.numeric(Values)
  )

law_vote_Eligible  %>% print(n = 10, width = Inf)
# A tibble: 450 × 5
   Division                  District  `Eligible Participants_total_number`
   <chr>                     <chr>                                    <dbl>
 1 New South Wales Divisions Banks                                   105254
 2 New South Wales Divisions Banks                                   105254
 3 New South Wales Divisions Banks                                   105254
 4 New South Wales Divisions Barton                                  109371
 5 New South Wales Divisions Barton                                  109371
 6 New South Wales Divisions Barton                                  109371
 7 New South Wales Divisions Bennelong                               106375
 8 New South Wales Divisions Bennelong                               106375
 9 New South Wales Divisions Bennelong                               106375
10 New South Wales Divisions Berowra                                 105090
   Status  Values
   <fct>    <dbl>
 1 Yes      84079
 2 No         247
 3 No Res.  20928
 4 Yes      85137
 5 No         226
 6 No Res.  24008
 7 Yes      86158
 8 No         244
 9 No Res.  19973
10 Yes      88840
# ℹ 440 more rows

4.2.1 Percentage distribution of response status based on different divisions in Australia

I use the “geom_col()” function to create a bar chart to visualize the percentage distribution of response status as this graph type is suitable for one categorical variable and one numerical variable.

To make the graphic more reader-friendly, I used

  1. the “fill=” argument to make the graphic more colorful;

  2. the “facet_wrap()” argument to make the graphic include the third variable, “Division;”

  3. the “scale” argument to adjust the axis range, label, and name;

  4. the “theme()” argument to center the title and change the legend’s position from the right to the bottom of the graphic;

  5. the “ggthemes::theme_few()” to clean up the background, which makes the graphic neater and might help the reader focus more on the useful information;

  6. the “scale_fill_okabeito()” to make the color scale more color-blind friendly.

# Per division and response status: total eligible participants, number of
# participants with that status, the resulting share, and a percent label.
law_vote_Eligible_1 <- law_vote_Eligible %>%
  group_by(Division, Status) %>%
  summarise(
    sum_total = sum(`Eligible Participants_total_number`),
    sum_status = sum(Values)
  ) %>%
  mutate(
    prop = sum_status / sum_total,
    labels = scales::percent(prop)
  ) %>%
  # Shorten the facet titles by dropping the word "Divisions".
  mutate(Division = str_remove(Division, "Divisions"))

print(law_vote_Eligible_1, n = 10, width = Inf)
# A tibble: 24 × 6
# Groups:   Division [8]
   Division                        Status  sum_total sum_status    prop labels
   <chr>                           <fct>       <dbl>      <dbl>   <dbl> <chr> 
 1 "Australian Capital Territory " Yes        288108     236979 0.823   82%   
 2 "Australian Capital Territory " No         288108        534 0.00185 0%    
 3 "Australian Capital Territory " No Res.    288108      50595 0.176   18%   
 4 "New South Wales "              Yes       5187681    4111200 0.792   79%   
 5 "New South Wales "              No        5187681      11036 0.00213 0%    
 6 "New South Wales "              No Res.   5187681    1065445 0.205   21%   
 7 "Northern Territory "           Yes        138101      80376 0.582   58%   
 8 "Northern Territory "           No         138101        229 0.00166 0%    
 9 "Northern Territory "           No Res.    138101      57496 0.416   42%   
10 "Queensland "                   Yes       3150873    2448075 0.777   78%   
# ℹ 14 more rows
# Faceted bar chart: share of each response status among eligible
# participants, one panel per division, with the percentage printed above
# every bar.
law_vote_Eligible_1 %>%
  ggplot(aes(x = Status, y = prop, fill = Status)) +
  # A single bar layer: drawing geom_col() and geom_bar(stat = "identity")
  # together paints every bar twice and hides the transparency setting.
  geom_col(alpha = .75) +
  scale_fill_okabeito(name = "Response Status") +
  facet_wrap(vars(Division)) +
  # Axis titles are set on the scales; `labels` is spelled out instead of
  # relying on partial argument matching.
  scale_x_discrete(name = "Response Status") +
  scale_y_continuous(
    limits = c(0, 1),
    name = "Percentage",
    labels = scales::percent
  ) +
  geom_text(aes(label = labels), size = 3, vjust = -.5) +
  ggthemes::theme_few() +
  labs(title = " Percentage Distribution Of Response Status- Eligible Participants") +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

4.2.2 Total number and percentage for response status

I created a bar chart to show the total number and percentage for response status. This is because this graph type is suitable for visualizing categorical and numerical variables.

To make the graphic more reader-friendly, I used

  1. the “fill=” argument to make the graphic more colorful;

  2. the “scale” argument to adjust the range, label, and name of the axis;

  3. the “theme()” argument to center the title and change the legend’s position from the right to the bottom of the graphic;

  4. the “ggthemes::theme_few()” to clean up the background, which makes the graphic neater and might help the reader focus more on the useful information;

  5. the “geom_text()” function to label the total number of participants for each response status.

# Nationwide totals per response status, the overall total across statuses,
# and each status's share of that total.
law_vote_Eligible_2 <- law_vote_Eligible %>%
  group_by(Status) %>%
  summarise(status_total = sum(Values)) %>%
  mutate(
    total = sum(status_total),
    porp = status_total / total
  )

print(law_vote_Eligible_2, n = 10, width = Inf)
# A tibble: 3 × 4
  Status  status_total    total    porp
  <fct>          <dbl>    <dbl>   <dbl>
1 Yes         12691234 16006180 0.793  
2 No             36686 16006180 0.00229
3 No Res.      3278260 16006180 0.205  
# Bar chart of each response status's share of all eligible participants,
# with the absolute number of participants printed above every bar.
law_vote_Eligible_2 %>%
  ggplot(aes(x = Status, y = porp, fill = Status)) +
  geom_col() +
  scale_x_discrete(name = "Response Status") +
  # Fixed 0-100% axis; `labels` is spelled out instead of relying on partial
  # argument matching.
  scale_y_continuous(
    limits = c(0, 1),
    labels = scales::percent,
    n.breaks = 10,
    name = "Percentage of Status"
  ) +
  geom_text(aes(label = status_total), size = 3, vjust = -.5) +
  ggthemes::theme_few() +
  labs(title = "Total Number And Percentage For Response Status") +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )