0330 HW 1-3

HW1

Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).

Load the data, check its structure, and display the first 6 rows of Caschool.

library(Ecdat)

## Warning: package 'Ecdat' was built under R version 3.6.3

## Warning: package 'Ecfun' was built under R version 3.6.3

# Preview the first six rows of Caschool to confirm the data set is available.
head(Caschool, n = 6L)

##   distcod  county                        district grspan enrltot teachers
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195    10.90
## 2   61499   Butte            Manzanita Elementary  KK-08     240    11.15
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550    82.90
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243    14.00
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335    71.50
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137     6.40
##   calwpct mealpct computer testscr   compstu  expnstu      str    avginc
## 1  0.5102  2.0408       67  690.80 0.3435898 6384.911 17.88991 22.690001
## 2 15.4167 47.9167      101  661.20 0.4208333 5099.381 21.52466  9.824000
## 3 55.0323 76.3226      169  643.60 0.1090323 5501.955 18.69723  8.978000
## 4 36.4754 77.0492       85  647.70 0.3497942 7101.831 17.35714  8.978000
## 5 33.1086 78.4270      171  640.85 0.1280899 5235.988 18.67133  9.080333
## 6 12.3188 86.9565       25  605.55 0.1824818 5580.147 21.40625 10.415000
##       elpct readscr mathscr
## 1  0.000000   691.6   690.0
## 2  4.583333   660.5   661.9
## 3 30.000002   636.3   650.9
## 4  0.000000   651.9   643.5
## 5 13.857677   641.8   639.9
## 6 12.408759   605.7   605.4

# Inspect column types; `county` is the factor used for stratified sampling.
str(object = Caschool)

## 'data.frame':    420 obs. of  17 variables:
##  $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers: num  10.9 11.1 82.9 14 71.5 ...
##  $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
##  $ testscr : num  691 661 644 648 641 ...
##  $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
##  $ expnstu : num  6385 5099 5502 7102 5236 ...
##  $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
##  $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ elpct   : num  0 4.58 30 0 13.86 ...
##  $ readscr : num  692 660 636 652 642 ...
##  $ mathscr : num  690 662 651 644 640 ...

sample by strata

# Fix the RNG seed so the same stratified sample is drawn on every run
# (the exercise requires a reproducible sample). Any output printed from an
# earlier unseeded run will change once the document is re-knit.
set.seed(330)

# Draw one school per county: simple random sampling without replacement
# ("srswor") within each stratum, with a sample size of 1 for every county
# level. The county column is referenced by name rather than by position.
sample_dta <- sampling::strata(
  Caschool,
  stratanames = "county",
  size = rep(1, nlevels(Caschool$county)),
  method = "srswor"
)

Keep only the rows that were selected in sample_dta.

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.3

# Keep only the rows whose row name appears in sample_dta$ID_unit.
df_sample <- filter(Caschool, rownames(Caschool) %in% sample_dta$ID_unit)

Keep only the rows that were selected in sample_dta, then display the first 6 rows of the sampled data.

# Rebuild the sampled data frame (rows whose row name appears in
# sample_dta$ID_unit) and show its first six rows.
df_sample <- Caschool %>%
  filter(rownames(Caschool) %in% sample_dta$ID_unit)
head(df_sample, n = 6L)

##   distcod      county                        district grspan enrltot teachers
## 1   75119     Alameda              Sunol Glen Unified  KK-08     195    10.90
## 2   61457       Butte Golden Feather Union Elementary  KK-08     243    14.00
## 3   68379   San Diego           San Ysidro Elementary  KK-08    4142   201.00
## 4   65870      Merced               Winton Elementary  KK-08    1657    90.40
## 5   69369 Santa Clara      Alum Rock Union Elementary  KK-08   16244   766.65
## 6   70409    Siskiyou        McCloud Union Elementary  KK-08     175    12.50
##   calwpct mealpct computer testscr    compstu  expnstu      str   avginc
## 1  0.5102  2.0408       67  690.80 0.34358975 6384.911 17.88991 22.69000
## 2 36.4754 77.0492       85  647.70 0.34979424 7101.831 17.35714  8.97800
## 3 35.5625 81.5065      569  624.55 0.13737325 5342.233 20.60697  6.61300
## 4 28.8473 84.7314      204  626.90 0.12311406 5225.719 18.32965  9.59800
## 5 17.4531 69.7399     1423  630.35 0.08760157 5645.496 21.18829 12.58158
## 6 21.7647 68.8235       31  635.60 0.17714286 6653.031 14.00000 10.65600
##      elpct readscr mathscr
## 1  0.00000   691.6   690.0
## 2  0.00000   651.9   643.5
## 3 80.42009   620.4   628.7
## 4 56.12553   623.6   630.2
## 5 49.86457   629.5   631.2
## 6  0.00000   634.6   636.6

scatter plot

# Scatter plot of average math score (y) against average reading score (x)
# for the sampled schools, as the exercise asks, with one plotting group per
# county. The legend option is spelled out in full as `columns` instead of
# relying on partial matching of `column`.
lattice::xyplot(
  mathscr ~ readscr,
  groups = county,
  data = df_sample,
  auto.key = list(columns = 5),
  ylab = "Math score",
  xlab = "Reading score"
)

HW 2

Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should look as follows:

load data and check data structure

# Take a working copy of nlschools straight from the MASS namespace (MASS
# itself is not attached) and preview its first six rows.
dta <- MASS::nlschools
head(dta, n = 6L)

##   lang   IQ class GS SES COMB
## 1   46 15.0   180 29  23    0
## 2   45 14.5   180 29  10    0
## 3   33  9.5   180 29  15    0
## 4   46 11.0   180 29  23    0
## 5   20  8.0   180 29  10    0
## 6   30  9.5   180 29  10    0

tidy approach

# Class-level summary of the language scores: per-class mean IQ, mean language
# score, and a 95% confidence interval for the language mean built from the
# normal-approximation critical value 1.96 and the standard error sd / sqrt(n).
dta1 <- dta %>%
  group_by(class) %>%
  # dplyr:: prefix avoids a clash with any other attached summarize()
  dplyr::summarize(
    n = n(),
    IQ_mean = mean(IQ),
    language_mean = mean(lang),
    language_se = sd(lang) / sqrt(n),
    language_lb = language_mean - 1.96 * language_se,
    language_ub = language_mean + 1.96 * language_se
  ) %>%
  # Number the classes 1..K from the summary table itself, so the ID vector
  # always has one entry per summarised class and does not depend on the
  # factor levels of the global `dta`.
  mutate(classID = seq_len(n())) %>%
  select(classID, IQ_mean, language_mean, language_lb, language_ub)

# Show the last six classes.
tail(dta1)

## # A tibble: 6 x 5
##   classID IQ_mean language_mean language_lb language_ub
##     <int>   <dbl>         <dbl>       <dbl>       <dbl>
## 1     128    12.8          45.5        41.1        49.9
## 2     129    12.2          41.5        39.0        44.0
## 3     130    11.9          40.3        35.2        45.3
## 4     131    11.3          38.1        34.7        41.4
## 5     132    10.6          29.3        21.1        37.5
## 6     133    10.6          28.4        23.3        33.6

1.group by class.
2.calculate the means, 95%CI.
3.create new variable “classID”.
4.select variables to save in dta1.
5.display last 6 rows of dta1.

HW3

Use the Prestige{car} data set for this problem. Find the median prestige score for each of the three types of occupation, respectively. Use the median score in each type of occupation to define two levels of prestige: High and low, for each occupation, respectively. Summarize the relationship between income and education for each category generated from crossing the factor prestige with the type of occupation.

load data and check data structure

library(carData)
# Inspect the Prestige columns; `type` is the occupation-type factor.
str(object = Prestige)

## 'data.frame':    102 obs. of  6 variables:
##  $ education: num  13.1 12.3 12.8 11.4 14.6 ...
##  $ income   : int  12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
##  $ women    : num  11.16 4.02 15.7 9.11 11.68 ...
##  $ prestige : num  68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
##  $ census   : int  1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
##  $ type     : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...

# Preview the first six occupations (occupation names are the row names).
head(Prestige, n = 6L)

##                     education income women prestige census type
## gov.administrators      13.11  12351 11.16     68.8   1113 prof
## general.managers        12.26  25879  4.02     69.1   1130 prof
## accountants             12.77   9271 15.70     63.4   1171 prof
## purchasing.officers     11.42   8865  9.11     56.8   1175 prof
## chemists                14.62   8403 11.68     73.5   2111 prof
## physicists              15.64  11030  5.13     77.6   2113 prof

median prestige score for each of the three types of occupation, respectively.

# Median prestige score within each occupation type; the result has one row
# per type with columns `type` and `prestige`.
m <- aggregate(prestige ~ type, data = Prestige, FUN = median)
m

##   type prestige
## 1   bc     35.9
## 2 prof     68.4
## 3   wc     41.5

Use the median score in each type of occupation to define two levels of prestige: High and low, for each occupation

# Keep the occupation names (stored as row names) as an ordinary column.
dta <- Prestige %>%
  mutate(occup = rownames(Prestige))

# Split by occupation type and label every occupation "High" or "Low" relative
# to the median prestige of its own type (scores equal to the median count as
# "Low"). The median is looked up in `m` by type name rather than by row
# position, and `type` is referenced as a column inside filter() instead of
# through `dta$type`.
dta_bc <- dta %>%
  filter(type == "bc") %>%
  mutate(level_prestige = if_else(
    prestige > m$prestige[m$type == "bc"], "High", "Low"
  ))
dta_prof <- dta %>%
  filter(type == "prof") %>%
  mutate(level_prestige = if_else(
    prestige > m$prestige[m$type == "prof"], "High", "Low"
  ))
dta_wc <- dta %>%
  filter(type == "wc") %>%
  mutate(level_prestige = if_else(
    prestige > m$prestige[m$type == "wc"], "High", "Low"
  ))

# Stack the three subsets; rows that matched none of the three type filters
# are not carried over.
dta_1 <- rbind(dta_bc, dta_prof, dta_wc)
str(dta_1)

## 'data.frame':    98 obs. of  8 variables:
##  $ education     : num  9.45 9.93 9.47 10.93 7.74 ...
##  $ income        : int  3485 2370 8895 8891 3116 3930 7869 3000 3472 3582 ...
##  $ women         : num  76.14 3.69 0 1.65 52 ...
##  $ prestige      : num  34.9 23.3 43.5 51.6 29.7 20.2 54.9 20.8 17.3 20.1 ...
##  $ census        : int  3135 5145 6111 6112 6121 6123 6141 6162 6191 6193 ...
##  $ type          : Factor w/ 3 levels "bc","prof","wc": 1 1 1 1 1 1 1 1 1 1 ...
##  $ occup         : chr  "nursing.aides" "service.station.attendant" "firefighters" "policemen" ...
##  $ level_prestige: chr  "Low" "Low" "High" "High" ...

# Preview the combined data, including the new level_prestige column.
head(dta_1, n = 6L)

##   education income women prestige census type                     occup
## 1      9.45   3485 76.14     34.9   3135   bc             nursing.aides
## 2      9.93   2370  3.69     23.3   5145   bc service.station.attendant
## 3      9.47   8895  0.00     43.5   6111   bc              firefighters
## 4     10.93   8891  1.65     51.6   6112   bc                 policemen
## 5      7.74   3116 52.00     29.7   6121   bc                     cooks
## 6      8.50   3930 15.51     20.2   6123   bc                bartenders
##   level_prestige
## 1            Low
## 2            Low
## 3           High
## 4           High
## 5            Low
## 6            Low

Summarize the relationship between income and education for each category generated from crossing the factor prestige with the type of occupation.

# Education (y) against income (x), one panel per prestige level x occupation
# type. A per-panel least-squares line replaces the original 'abline' geom,
# which without a slope/intercept draws the fixed line y = x rather than a
# fitted trend. ggplot() is used in place of the deprecated qplot().
ggplot2::ggplot(
  dta_1,
  ggplot2::aes(x = income, y = education, colour = level_prestige)
) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  ggplot2::facet_grid(level_prestige ~ type)

correlation between education and income in types of occupation

library(plyr)

## Warning: package 'plyr' was built under R version 3.6.3

# Correlation between education and income, computed separately for each
# occupation type.
ddply(
  .data = dta_1,
  .variables = "type",
  .fun = summarise,
  corr = cor(education, income)
)

##   type      corr
## 1   bc 0.5032838
## 2 prof 0.2827775
## 3   wc 0.1409227

Correlation between education and income within each prestige level (High vs. Low), computed separately for each type of occupation

# Occupation type "bc": education-income correlation per prestige level.
ddply(
  .data = dta_bc,
  .variables = "level_prestige",
  .fun = summarise,
  corr = cor(education, income)
)

##   level_prestige       corr
## 1           High 0.44956311
## 2            Low 0.05737649

# Occupation type "prof": education-income correlation per prestige level.
ddply(
  .data = dta_prof,
  .variables = "level_prestige",
  .fun = summarise,
  corr = cor(education, income)
)

##   level_prestige         corr
## 1           High 1.661953e-06
## 2            Low 3.728961e-01

# type: wc
# Use the "wc" subset (dta_wc) rather than the pooled dta_1, so the
# correlations describe this occupation type only, matching the bc and prof
# summaries. Any printed result produced from the pooled data must be
# regenerated after this change.
ddply(dta_wc, "level_prestige", summarise, corr = cor(education, income))

##   level_prestige      corr
## 1           High 0.4901340
## 2            Low 0.6467921