0330_Exercise 1

Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of the average math score (mathscr) against the average reading score (readscr) for the sampled data set. Make sure your results are reproducible (i.e., the same random sample will be drawn each time).

# Load the Caschool data set from the Ecdat package (accessed with `::`, so
# the package itself is not attached) and preview its first rows.
dta <- Ecdat::Caschool
head(dta)
##   distcod  county                        district grspan enrltot teachers
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195    10.90
## 2   61499   Butte            Manzanita Elementary  KK-08     240    11.15
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550    82.90
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243    14.00
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335    71.50
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137     6.40
##   calwpct mealpct computer testscr   compstu  expnstu      str    avginc
## 1  0.5102  2.0408       67  690.80 0.3435898 6384.911 17.88991 22.690001
## 2 15.4167 47.9167      101  661.20 0.4208333 5099.381 21.52466  9.824000
## 3 55.0323 76.3226      169  643.60 0.1090323 5501.955 18.69723  8.978000
## 4 36.4754 77.0492       85  647.70 0.3497942 7101.831 17.35714  8.978000
## 5 33.1086 78.4270      171  640.85 0.1280899 5235.988 18.67133  9.080333
## 6 12.3188 86.9565       25  605.55 0.1824818 5580.147 21.40625 10.415000
##       elpct readscr mathscr
## 1  0.000000   691.6   690.0
## 2  4.583333   660.5   661.9
## 3 30.000002   636.3   650.9
## 4  0.000000   651.9   643.5
## 5 13.857677   641.8   639.9
## 6 12.408759   605.7   605.4
# List the column names of the school data.
names(dta)
##  [1] "distcod"  "county"   "district" "grspan"   "enrltot"  "teachers"
##  [7] "calwpct"  "mealpct"  "computer" "testscr"  "compstu"  "expnstu" 
## [13] "str"      "avginc"   "elpct"    "readscr"  "mathscr"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Fix the RNG seed so that the same random sample is drawn on every run.
set.seed(1001)
# Draw one school at random within each county, then drop the grouping so
# that later operations on dta_r are not silently performed per county.
dta_r <- dta %>%
  group_by(county) %>%
  sample_n(1) %>%
  ungroup()
# "mathscr against readscr": the math score is the response (y-axis) and the
# reading score goes on the x-axis.
plot(dta_r$readscr, dta_r$mathscr,
     xlab = "Average reading score", ylab = "Average math score")

0330_Exercise 2

Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach.

# Load the nlschools data set from the MASS package (accessed with `::`, so
# MASS itself is not attached) and preview its first rows.
dta2 <- MASS::nlschools
head(dta2)
##   lang   IQ class GS SES COMB
## 1   46 15.0   180 29  23    0
## 2   45 14.5   180 29  10    0
## 3   33  9.5   180 29  15    0
## 4   46 11.0   180 29  23    0
## 5   20  8.0   180 29  10    0
## 6   30  9.5   180 29  10    0
# Inspect the column types and factor levels of dta2.
str(dta2)
## 'data.frame':    2287 obs. of  6 variables:
##  $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
##  $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# Class-level 95% confidence intervals for the mean language score.
# Classes are relabelled with consecutive integers in the order of the
# original factor levels. The bounds use the normal critical value 1.96.
# NOTE(review): qt(0.975, n() - 1) may be more appropriate for small
# classes -- confirm which interval the exercise expects.
# Only the last three classes are printed.
dta2 %>%
  mutate(
    classID = factor(class,
                     levels = levels(class),
                     labels = seq_along(levels(class)))
  ) %>%
  group_by(classID) %>%
  summarise(
    language_mean = mean(lang, na.rm = TRUE),
    language_SE = sd(lang, na.rm = TRUE) / sqrt(n()),
    language_lb = language_mean - 1.96 * language_SE,
    language_ub = language_mean + 1.96 * language_SE
  ) %>%
  tail(3)
## # A tibble: 3 x 5
##   classID language_mean language_SE language_lb language_ub
##   <fct>           <dbl>       <dbl>       <dbl>       <dbl>
## 1 131              38.1        1.71        34.7        41.4
## 2 132              29.3        4.20        21.1        37.5
## 3 133              28.4        2.64        23.3        33.6

0330_Exercise 3

Use the Prestige{car} data set for this problem.

- Find the median prestige score for each of the three types of occupation.
- Use the median score within each type of occupation to define two levels of prestige, High and Low, for the occupations of that type. Summarize the relationship between income and education for each category generated from crossing the factor prestige with the type of occupation.

#load data
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Preview the first rows of Prestige (available once library(car) has loaded
# carData).
head(Prestige)
##                     education income women prestige census type
## gov.administrators      13.11  12351 11.16     68.8   1113 prof
## general.managers        12.26  25879  4.02     69.1   1130 prof
## accountants             12.77   9271 15.70     63.4   1171 prof
## purchasing.officers     11.42   8865  9.11     56.8   1175 prof
## chemists                14.62   8403 11.68     73.5   2111 prof
## physicists              15.64  11030  5.13     77.6   2113 prof
# Inspect the column types of Prestige; `type` is the occupation-type factor.
str(Prestige)
## 'data.frame':    102 obs. of  6 variables:
##  $ education: num  13.1 12.3 12.8 11.4 14.6 ...
##  $ income   : int  12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
##  $ women    : num  11.16 4.02 15.7 9.11 11.68 ...
##  $ prestige : num  68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
##  $ census   : int  1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
##  $ type     : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...
# Within each occupation type, compute the median prestige score (pmedian)
# and split the occupations into two levels: "High" when the score is at or
# above the type median, "Low" when it is below. Using `>=` means an
# occupation sitting exactly on the median still receives a level instead of
# NA. The grouping is dropped afterwards so dta3 behaves like a plain table.

dta3 <- Prestige %>%
  group_by(type) %>%
  mutate(pmedian = median(prestige),
         plevel = if_else(prestige >= pmedian, "High", "Low")) %>%
  ungroup()
## Warning: Factor `type` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `type` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#xyplot by type
library(lattice)
# Income against education, one panel per occupation type. Within each panel
# the points and fitted regression lines are grouped by prestige level and
# drawn over a reference grid.
xyplot(income ~ education | type,
       groups = plevel,
       data = dta3,
       type = c("g", "p", "r"))

0330_Exercise 4

Reverse the order of input to the series of dplyr::*_join examples using data from the Nobel laureates in literature and explain the resulting output.

# Read the country table; the first line of the file holds the column names.
dta41 <- read.table(file = "nobel_countries.txt", header = TRUE)
# Check the imported column types.
str(dta41)
## 'data.frame':    8 obs. of  2 variables:
##  $ Country: Factor w/ 7 levels "Canada","China",..: 3 6 6 7 1 2 4 5
##  $ Year   : int  2014 1950 2017 2016 2013 2012 2015 2011
# Read the winners table; the first line of the file holds the column names.
dta42 <- read.table(file = "nobel_winners.txt", header = TRUE)
# Check the imported column types.
str(dta42)
## 'data.frame':    7 obs. of  3 variables:
##  $ Name  : Factor w/ 7 levels "Alice  Munro",..: 6 2 4 3 1 5 7
##  $ Gender: Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 1
##  $ Year  : int  2014 1950 2017 2016 2013 2012 1938
# Left join with the country table as the left input (keyed on the shared
# column Year): all 8 country rows are kept, and Name/Gender are filled in
# wherever a winner has the same Year. Years that appear only in the country
# table (2011, 2015) get NA, and the winner whose Year (1938) has no country
# row is dropped from the result.
library(dplyr)
dta4 <- dta41 %>% left_join(dta42)
## Joining, by = "Year"
# Show the merged rows in chronological order.
dta4 %>% arrange(Year)
##   Country Year              Name Gender
## 1      UK 1950 Bertrand  Russell   Male
## 2  Sweden 2011              <NA>   <NA>
## 3   China 2012            Mo Yan   Male
## 4  Canada 2013      Alice  Munro Female
## 5  France 2014   Patrick Modiano   Male
## 6  Russia 2015              <NA>   <NA>
## 7      US 2016        Bob  Dylan   Male
## 8      UK 2017    Kazuo Ishiguro   Male

0330_Exercise 5

Augment the data object in the ‘SAT’ lecture note with state.division{datasets}. For each of the 9 divisions, find the slope estimate for regressing average SAT scores onto average teacher’s salary. How many of them are of negative signs?

# Inspect state.division, the built-in factor of divisions that is attached
# to the SAT data below as Region.
str(state.division)
##  Factor w/ 9 levels "New England",..: 4 9 8 5 9 8 1 3 3 3 ...
# Download the SAT data; the first column of the file supplies the row names.
fL <- "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
dta5 <- read.table(file = fL, row.names = 1)
# The file has no header line, so name the columns by hand.
colnames(dta5) <- c("Spending", "PTR", "Salary", "PE", "Verbal", "Math", "SAT")
# Attach the division of each state as Region.
# NOTE(review): this assigns by position, so it assumes the rows of the file
# are in the same order as the built-in state vectors -- the preview below is
# consistent with that, but confirm for all rows.
dta5[["Region"]] <- state.division
head(dta5)
##            Spending  PTR Salary PE Verbal Math  SAT             Region
## Alabama       4.405 17.2 31.144  8    491  538 1029 East South Central
## Alaska        8.963 17.6 47.951 47    445  489  934            Pacific
## Arizona       4.778 19.3 32.175 27    448  496  944           Mountain
## Arkansas      4.459 17.1 28.934  6    482  523 1005 West South Central
## California    4.992 24.0 41.078 45    417  485  902            Pacific
## Colorado      5.443 18.4 34.571 29    462  518  980           Mountain
# One panel per division: SAT against salary with a grid, a fitted regression
# line and the data points.
dta5 %>% xyplot(SAT~Salary | Region, type=c("g", "r", "p"), data=.)
# The panels only suggest the sign of each slope, so estimate it numerically
# by fitting SAT ~ Salary separately within every division.
slopes <- dta5 %>%
  group_by(Region) %>%
  summarise(slope = coef(lm(SAT ~ Salary))[["Salary"]])
slopes
# Number of divisions whose estimated slope is negative.
sum(slopes$slope < 0)

#**Negative signs (4 of the 9 divisions, judged from the fitted lines in the panels):
  #1. Mountain, West South Central (*strongly negative*)
  #2. West North Central, East South Central (*slightly negative*)**