ex1

# The file is tab-delimited and some fields are empty, so the separator is
# given explicitly as sep = "\t" instead of relying on the default
# whitespace splitting.
dta <- read.table(
  "http://www1.aucegypt.edu/faculty/hadi/RABE5/Data5/P005.txt",
  header = TRUE,  # spelled out: T is an ordinary variable and can be reassigned
  sep = "\t"
)

# Preview the first rows of the imported data.
head(dta)
##          City COL   PD URate     Pop Taxes Income RTWL
## 1     Atlanta 169  414  13.6 1790128  5128   2961    1
## 2      Austin 143  239  11.0  396891  4303   1711    1
## 3 Bakersfield 339   43  23.7  349874  4166   2122    0
## 4   Baltimore 173  951  21.0 2147850  5001   4654    0
## 5 Baton Rouge  99  255  16.0  411725  3965   1620    1
## 6      Boston 363 1257  24.4 3914071  4928   5634    0

ex3

# passwd.txt is evaluated as R code; it is expected to define IDPW, the piece
# spliced between "http://" and the host below (presumably credentials of the
# form "user:password@" -- confirm against passwd.txt).
# NOTE(review): whatever IDPW holds is sent in clear text over plain http.
source("passwd.txt")
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/juniorSchools.txt")

# Read the table (first line holds the column names), then preview it.
# TRUE is spelled out because T is a reassignable variable.
dta <- read.table(fL, header = TRUE)
head(dta)
##   school class sex soc ravens pupil english math year
## 1     S1    C1   G   9     23    P1      72   23    0
## 2     S1    C1   G   9     23    P1      80   24    1
## 3     S1    C1   G   9     23    P1      39   23    2
## 4     S1    C1   B   2     15    P2       7   14    0
## 5     S1    C1   B   2     15    P2      17   11    1
## 6     S1    C1   B   2     22    P3      88   36    0

ex3-1

# Relabel the third column as "Gender" and preview the result.
names(dta) <- replace(names(dta), 3, "Gender")
head(dta)
##   school class Gender soc ravens pupil english math year
## 1     S1    C1      G   9     23    P1      72   23    0
## 2     S1    C1      G   9     23    P1      80   24    1
## 3     S1    C1      G   9     23    P1      39   23    2
## 4     S1    C1      B   2     15    P2       7   14    0
## 5     S1    C1      B   2     15    P2      17   11    1
## 6     S1    C1      B   2     22    P3      88   36    0

ex3-2

# Recode the numeric codes 1-9 in `soc` as a factor with descriptive labels,
# stored in the new column `new_soc`.
dta[["new_soc"]] <- factor(
  dta[["soc"]],
  levels = 1:9,
  labels = c(
    "I", "II", "III_0man", "III_man", "IV", "V",
    "VI_Unemp_L", "VII_emp_NC", "VIII_Miss_Dad"
  )
)
# List the labels of the new factor.
levels(dta[["new_soc"]])
## [1] "I"             "II"            "III_0man"      "III_man"      
## [5] "IV"            "V"             "VI_Unemp_L"    "VII_emp_NC"   
## [9] "VIII_Miss_Dad"
# Plot math score against social class. Because x is a factor and y is
# numeric, plot() draws one box per class.
with(
  dta,
  plot(
    x = new_soc,
    y = math,
    xlab = "Social Class",
    ylab = "Mathematics test score",
    cex.axis = 0.8  # shrink tick labels
  )
)

ex3-3

# Export the current contents of dta as a CSV file in the working directory.
write.csv(x = dta, file = "ex033d20180320.csv")

ex4

library(readr)
# Location of the fixed-width AAUP data file.
fL <- "http://www.amstat.org/publications/jse/datasets/aaup2.dat.txt"
# Let readr guess the column boundaries from the empty columns in the file and
# show only the begin/end positions.
readr::fwf_empty(file = fL)[c("begin", "end")]
## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
# Read the fixed-width file using explicit column widths (one name = width
# pair per column, in file order). "*" and "NA" are both treated as missing.
dta <- read_fwf(
  file = fL,
  col_positions = fwf_cols(
    ID = 5,
    University = 32,
    var3 = 3,
    var4 = 4,
    var5 = 4,
    var6 = 4,
    var7 = 4,
    var8 = 5,
    var9 = 4,
    var10 = 4,
    var11 = 5,
    var12 = 4,
    var13 = 4,
    var14 = 4,
    var15 = 4,
    var16 = 4,
    var17 = 5
  ),
  na = c("*", "NA")
)
# Preview the first rows.
head(dta)
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   University = col_character(),
##   var3 = col_character(),
##   var4 = col_character(),
##   var5 = col_integer(),
##   var6 = col_integer(),
##   var7 = col_integer(),
##   var8 = col_integer(),
##   var9 = col_integer(),
##   var10 = col_integer(),
##   var11 = col_integer(),
##   var12 = col_integer(),
##   var13 = col_integer(),
##   var14 = col_integer(),
##   var15 = col_integer(),
##   var16 = col_integer(),
##   var17 = col_integer()
## )
## # A tibble: 6 x 17
##      ID University   var3  var4   var5  var6  var7  var8  var9 var10 var11
##   <int> <chr>        <chr> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1  1061 Alaska Paci~ AK    IIB     454   382   362   382   567   485   471
## 2  1063 Univ.Alaska~ AK    I       686   560   432   508   914   753   572
## 3  1065 Univ.Alaska~ AK    IIA     533   494   329   415   716   663   442
## 4 11462 Univ.Alaska~ AK    IIA     612   507   414   498   825   681   557
## 5  1002 Alabama Agr~ AL    IIA     442   369   310   350   530   444   376
## 6  1004 University ~ AL    IIA     441   385   310   388   542   473   383
## # ... with 6 more variables: var12 <int>, var13 <int>, var14 <int>,
## #   var15 <int>, var16 <int>, var17 <int>

ex5

pacman::p_load(ggplot2, data.table, dplyr, magrittr, tidyr )
# Build the URL of the zipped Subject1 data. IDPW is defined outside this
# snippet (presumably the credential prefix loaded earlier -- confirm).
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/Subject1.zip")
# Fetch the archive in binary mode so it is not corrupted, then unpack it into
# the working directory.
download.file(url = fL, destfile = "Subject1.zip", mode = "wb")
unzip(zipfile = "Subject1.zip")

# Relative paths of the four data files: Subject1/1w.dat ... Subject1/4w.dat.
# The suffix is passed as an ordinary piece: paste0() has no `sep` argument,
# so a `sep = "w.dat"` value is merely pasted on as one more string.
fLs <- file.path("Subject1", paste0(1:4, "w.dat"))

# Read every file in fLs (tab-separated, first line skipped, next line used as
# header), stack the resulting tables row-wise, and drop the column named X
# (presumably an artifact column created while parsing -- confirm).
dta <- fLs %>%
  lapply(function(path) {
    read.table(path, header = TRUE, skip = 1, sep = "\t")
  }) %>%
  rbindlist() %>%
  select(-X)

# Tidy the column names in two passes:
#   inner gsub: delete every "X" together with the six characters after it;
#   outer gsub: delete all remaining literal dots.
names(dta) <- gsub("[.]", "", gsub("X......", "", names(dta)))

# Reshape to long format: one row per (ID, Attribute) pair.
#  - ID numbers the rows 1..n; it is derived from the data rather than
#    hard-coded so the code keeps working if the number of rows changes.
#  - Columns 1 to 30 are stacked into Attribute (former column name) and
#    score (value).
#  - Attribute_f is a factor copy of Attribute for plotting.
new_dta <- dta %>%
  mutate(ID = seq_len(n())) %>%
  gather(key = "Attribute", value = "score", 1:30) %>%
  mutate(Attribute_f = as.factor(Attribute))

# One box per attribute showing the distribution of score.
# The axis labels are Chinese for "brain location" (x) and "milliseconds" (y).
ggplot(data = new_dta, mapping = aes(x = Attribute_f, y = score)) +
  geom_boxplot() +
  labs(x = "腦位置", y = "毫秒")

ex6

# URL of the fixed-width cities file; IDPW is defined outside this snippet.
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/cities10.txt")
# Guess the column boundaries from the empty columns and show only the
# begin/end positions.
fwf_empty(file = fL)[c("begin", "end")]
## $begin
## [1]  0 19
## 
## $end
## [1] 17 NA
# Read the first 10 lines as two fixed-width fields: 19 characters for the
# city and 8 for the population. The argument is spelled `widths` in full
# rather than relying on partial matching of `width`.
dta <- read.fwf(
  fL,
  widths = c(19, 8),
  col.names = c("city", "population"),
  n = 10
)
# Preview the first rows.
head(dta)
##                  city population
## 1 New York, NY          66,834.6
## 2 Kings, NY             34,722.9
## 3 Bronx, NY             31,729.8
## 4 Queens, NY            20,453.0
## 5 San Francisco, CA     16,526.2
## 6 Hudson, NJ            12,956.9
# Split "City, ST" into separate city and state columns and fix the types.
new_dta <- dta %>%
  separate(city, into = c("city", "state"), sep = ",") %>%
  mutate(
    city = factor(city),
    # The population values carry a thousands separator (e.g. "66,834.6").
    # Converting them directly yields NA (or factor codes when the column was
    # read as a factor), so strip the commas from the text first.
    population = as.numeric(
      gsub(",", "", as.character(population), fixed = TRUE)
    ),
    # Remove the blanks left around the state code by the fixed-width padding.
    state = as.factor(gsub(" ", "", state))
  )
# Bar chart of population by city, bars ordered from largest to smallest
# population and filled by state.
ggplot(
  data = new_dta,
  mapping = aes(
    x = reorder(city, -population),
    y = population,
    fill = state
  )
) +
  geom_bar(stat = "identity") +
  labs(x = "city", y = "population")

ex7

# Read the reaction-time table: the first 4 lines of the file are skipped and
# the 30 columns are named T101 ... T130.
fL <- "http://www.stat.columbia.edu/~gelman/book/data/schiz.asc"
dta <- read.table(file = fL, skip = 4, col.names = sprintf("T%d", 101:130))
# Preview the first rows.
head(x = dta)
##   T101 T102 T103 T104 T105 T106 T107 T108 T109 T110 T111 T112 T113 T114
## 1  312  272  350  286  268  328  298  356  292  308  296  372  396  402
## 2  354  346  384  342  302  312  322  376  306  402  320  298  308  414
## 3  256  284  320  274  324  268  370  430  314  312  362  256  342  388
## 4  260  294  306  292  264  290  272  268  344  362  330  280  354  320
## 5  204  272  250  260  314  308  246  236  208  268  272  264  308  236
## 6  590  312  286  310  778  364  318  316  316  298  344  262  274  330
##   T115 T116 T117 T118 T119 T120 T121 T122 T123 T124 T125 T126 T127 T128
## 1  280  330  254  282  350  328  332  308  292  258  340  242  306  328
## 2  304  422  388  422  426  338  332  426  478  372  392  374  430  388
## 3  302  366  298  396  274  226  328  274  258  220  236  272  322  284
## 4  334  276  418  288  338  350  350  324  286  322  280  256  218  256
## 5  238  350  272  252  252  236  306  238  350  206  260  280  274  318
## 6  312  310  376  326  346  334  282  292  282  300  290  302  300  306
##   T129 T130
## 1  294  272
## 2  354  368
## 3  274  356
## 4  220  356
## 5  268  210
## 6  294  444
# Add the group label and a subject id, then reshape to long format with one
# row per (subject, measurement).
#
# Group coding: in this data set (Gelman et al., Bayesian Data Analysis) the
# first 11 rows are the non-schizophrenic subjects and the last 6 rows the
# schizophrenic ones, so the labels are "N" x 11 followed by "Y" x 6.
# NOTE(review): confirm against the description in the skipped file header.
# Any recorded output elsewhere in this file that shows the larger mean for
# "N" was produced with the two labels the other way round.
new_dta <- dta %>%
  mutate(
    schizoid = rep(c("N", "Y"), times = c(11, 6)),
    id = seq_len(n())  # one id per subject (row)
  ) %>%
  gather(key = "measure", value = "ms", 1:30) %>%
  mutate(measure = as.factor(measure))

觀察資料的描述性統計

# Mean of ms within each level of schizoid.
aggregate(ms ~ schizoid, data = new_dta, FUN = mean)
##   schizoid       ms
## 1        N 506.8667
## 2        Y 310.1697
# Standard deviation of ms within each level of schizoid.
aggregate(ms ~ schizoid, data = new_dta, FUN = sd)
##   schizoid       ms
## 1        N 262.8473
## 2        Y  64.8805
# Box plots of ms for each measurement, coloured by group; the axes are
# flipped so the 30 measurements run down the vertical axis.
ggplot(
  data = new_dta,
  mapping = aes(x = measure, y = ms, color = schizoid)
) +
  geom_boxplot() +
  coord_flip()

以線性模型做 ANCOVA:控制 measure 之後,檢定 schizoid 的效果

# Additive linear model: ms explained by measurement and group. The anova
# table is sequential, so schizoid is tested after measure has been entered.
m0 <- lm(formula = ms ~ measure + schizoid, data = new_dta)
anova(object = m0)
## Analysis of Variance Table
## 
## Response: ms
##            Df   Sum Sq Mean Sq  F value Pr(>F)    
## measure    29   638735   22025   0.8046 0.7571    
## schizoid    1  4506212 4506212 164.6050 <2e-16 ***
## Residuals 479 13113064   27376                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

結果顯示:在控制各次測量(measure)之間的變異後,受試者是否罹患精神疾病對反應時間有顯著影響(F(1, 479) = 164.61,p < .001)。

ex8

# URL of the class roster; IDPW is defined outside this snippet.
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/ncku_roster.csv")
# Read only the first line to obtain the column titles as text.
# (TRUE/FALSE are spelled out because T and F are reassignable variables.)
headers <- read.csv(fL, header = FALSE, nrows = 1, as.is = TRUE)
# Read the body, skipping the first two lines of the file.
dta <- read.csv(fL, skip = 2, header = FALSE)
# `headers` is a one-row data frame; flatten it to a plain character vector
# before using it as column names instead of relying on implicit coercion.
colnames(dta) <- unlist(headers, use.names = FALSE)
# Split the second column at spaces into `major` and `info`; pieces beyond the
# second are discarded (tidyr warns about this). Then store major as a factor.
new_dta <- separate(dta, 2, into = c("major", "info"), sep = " ")
new_dta <- mutate(new_dta, major = as.factor(major))
## Warning: Expected 2 pieces. Additional pieces discarded in 15 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].
# Bar chart of the number of students per major. The Chinese labels read
# "student origin" (x axis) and "departments taking Data Management"
# (subtitle).
ggplot(data = new_dta, mapping = aes(x = major, y = ..count..)) +
  geom_bar() +
  labs(subtitle = "資料管理修課系所", x = "學生來源", y = "Count")

pacman::p_load(RISmed)
# Years to query, one PubMed search per year.
Span <- 2001:2017
# For each year, run an esearch for "deep learning" restricted to that single
# year and keep the number of hits; the counts are then named by year.
Tally <- setNames(
  sapply(Span, function(year) {
    hits <- EUtilsSummary(
      "deep learning",
      type = "esearch",
      db = "pubmed",
      mindate = year,
      maxdate = year
    )
    QueryCount(hits)
  }),
  Span
)
# Bar chart of yearly counts; las = 2 turns the axis labels perpendicular to
# the axes and the y axis is fixed to 0-1500.
barplot(
  height = Tally,
  las = 2,
  ylim = c(0, 1500),
  main = "",
  ylab = "Number of Articles"
)