# The file is tab-delimited and contains empty fields, so the separator is
# given explicitly as sep = "\t".
dta <- read.table(
  "http://www1.aucegypt.edu/faculty/hadi/RABE5/Data5/P005.txt",
  header = TRUE,
  sep = "\t"
)
head(dta)
## City COL PD URate Pop Taxes Income RTWL
## 1 Atlanta 169 414 13.6 1790128 5128 2961 1
## 2 Austin 143 239 11.0 396891 4303 1711 1
## 3 Bakersfield 339 43 23.7 349874 4166 2122 0
## 4 Baltimore 173 951 21.0 2147850 5001 4654 0
## 5 Baton Rouge 99 255 16.0 411725 3965 1620 1
## 6 Boston 363 1257 24.4 3914071 4928 5634 0
# passwd.txt is sourced first; it presumably defines IDPW, which is spliced
# into the URL below (confirm against the contents of passwd.txt).
source("passwd.txt")
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/juniorSchools.txt")
# Read the whitespace-delimited table with a header row, then preview it.
dta <- read.table(fL, header = TRUE)
head(dta)
## school class sex soc ravens pupil english math year
## 1 S1 C1 G 9 23 P1 72 23 0
## 2 S1 C1 G 9 23 P1 80 24 1
## 3 S1 C1 G 9 23 P1 39 23 2
## 4 S1 C1 B 2 15 P2 7 14 0
## 5 S1 C1 B 2 15 P2 17 11 1
## 6 S1 C1 B 2 22 P3 88 36 0
# Relabel the third column as "Gender" and preview the result.
colnames(dta)[3] <- "Gender"
head(dta)
## school class Gender soc ravens pupil english math year
## 1 S1 C1 G 9 23 P1 72 23 0
## 2 S1 C1 G 9 23 P1 80 24 1
## 3 S1 C1 G 9 23 P1 39 23 2
## 4 S1 C1 B 2 15 P2 7 14 0
## 5 S1 C1 B 2 15 P2 17 11 1
## 6 S1 C1 B 2 22 P3 88 36 0
# Recode the numeric soc codes 1-9 as a factor with descriptive labels.
dta$new_soc <- factor(
  dta$soc,
  levels = 1:9,
  labels = c(
    "I", "II", "III_0man", "III_man", "IV", "V",
    "VI_Unemp_L", "VII_emp_NC", "VIII_Miss_Dad"
  )
)
levels(dta$new_soc)
## [1] "I" "II" "III_0man" "III_man"
## [5] "IV" "V" "VI_Unemp_L" "VII_emp_NC"
## [9] "VIII_Miss_Dad"
# Plot math score against the relabelled social-class factor, then save the
# augmented data frame to disk.
with(
  dta,
  plot(
    x = new_soc,
    y = math,
    xlab = "Social Class",
    ylab = "Mathematics test score",
    cex.axis = 0.8
  )
)
write.csv(dta, file = "ex033d20180320.csv")
library(readr)
# Location of the fixed-width AAUP data; ask readr to guess the field
# boundaries and show only the begin/end positions.
fL <- "http://www.amstat.org/publications/jse/datasets/aaup2.dat.txt"
readr::fwf_empty(fL)[c("begin", "end")]
## $begin
## [1] 0 6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
##
## $end
## [1] 5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
# Read the fixed-width file using explicit field widths; "*" and "NA" are
# both treated as missing values.
dta <- read_fwf(
  fL,
  fwf_widths(
    c(5, 32, 3, 4, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 4, 4, 5),
    col_names = c("ID", "University", paste0("var", 3:17))
  ),
  na = c("*", "NA")
)
head(dta)
## Parsed with column specification:
## cols(
## ID = col_integer(),
## University = col_character(),
## var3 = col_character(),
## var4 = col_character(),
## var5 = col_integer(),
## var6 = col_integer(),
## var7 = col_integer(),
## var8 = col_integer(),
## var9 = col_integer(),
## var10 = col_integer(),
## var11 = col_integer(),
## var12 = col_integer(),
## var13 = col_integer(),
## var14 = col_integer(),
## var15 = col_integer(),
## var16 = col_integer(),
## var17 = col_integer()
## )
## # A tibble: 6 x 17
## ID University var3 var4 var5 var6 var7 var8 var9 var10 var11
## <int> <chr> <chr> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 1061 Alaska Paci~ AK IIB 454 382 362 382 567 485 471
## 2 1063 Univ.Alaska~ AK I 686 560 432 508 914 753 572
## 3 1065 Univ.Alaska~ AK IIA 533 494 329 415 716 663 442
## 4 11462 Univ.Alaska~ AK IIA 612 507 414 498 825 681 557
## 5 1002 Alabama Agr~ AL IIA 442 369 310 350 530 444 376
## 6 1004 University ~ AL IIA 441 385 310 388 542 473 383
## # ... with 6 more variables: var12 <int>, var13 <int>, var14 <int>,
## # var15 <int>, var16 <int>, var17 <int>
pacman::p_load(ggplot2, data.table, dplyr, magrittr, tidyr )
# Download and unpack the Subject1 archive into the working directory.
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/Subject1.zip")
download.file(fL, "Subject1.zip", mode = "wb")
unzip("Subject1.zip")

# Paths of the four data files: Subject1/1w.dat ... Subject1/4w.dat.
# paste0() has no `sep` argument, so the suffix is passed as a plain string.
fLs <- paste0("Subject1/", 1:4, "w.dat")

# Read each tab-delimited file (skipping its first line), stack them, and
# drop the column named X.
dta <- lapply(fLs, read.table, header = TRUE, skip = 1, sep = "\t") %>%
  rbindlist() %>%
  select(-X)

# Tidy the column names: remove "X" plus the six characters that follow it,
# then remove any remaining dots.
names(dta) <- gsub("X......", "", names(dta))
names(dta) <- gsub("[.]", "", names(dta))

# Add a row ID sized from the data (rather than a hard-coded row count), then
# gather the first 30 columns into long format.
new_dta <- dta %>%
  mutate(ID = seq_len(n())) %>%
  gather(key = "Attribute", value = "score", 1:30) %>%
  mutate(Attribute_f = as.factor(Attribute))

# One boxplot of score per attribute.
ggplot(new_dta, aes(Attribute_f, score)) +
  geom_boxplot() +
  labs(x = "腦位置", y = "毫秒")
# URL of the cities file (IDPW is spliced in after the scheme); let readr
# guess the field boundaries and show only the begin/end positions.
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/cities10.txt")
fwf_empty(fL)[c("begin", "end")]
## $begin
## [1] 0 19
##
## $end
## [1] 17 NA
# Read the first 10 records as two fixed-width fields (19 and 8 characters).
# The argument is spelled `widths` in full rather than relying on partial
# argument matching.
dta <- read.fwf(
  fL,
  widths = c(19, 8),
  col.names = c("city", "population"),
  n = 10
)
head(dta)
## city population
## 1 New York, NY 66,834.6
## 2 Kings, NY 34,722.9
## 3 Bronx, NY 31,729.8
## 4 Queens, NY 20,453.0
## 5 San Francisco, CA 16,526.2
## 6 Hudson, NJ 12,956.9
# Split "city, state" into two columns and convert population to numeric.
# The population field carries thousands separators (e.g. "66,834.6"), so the
# commas are stripped before as.numeric(); converting the raw field directly
# would not yield the printed values.
new_dta <- dta %>%
  separate(city, into = c("city", "state"), sep = ",") %>%
  mutate(
    city = factor(city),
    population = as.numeric(gsub(",", "", population)),
    state = as.factor(gsub(" ", "", state))
  )

# Bar chart of population by city, ordered from largest to smallest and
# coloured by state.
ggplot(new_dta, aes(reorder(city, -population), population, fill = state)) +
  geom_bar(stat = "identity") +
  labs(x = "city", y = "population")
# Read the reaction-time table, skipping the four leading lines of the file
# and naming the 30 measurement columns T101 ... T130.
fL <- "http://www.stat.columbia.edu/~gelman/book/data/schiz.asc"
dta <- read.table(
  fL,
  skip = 4,
  col.names = sprintf("T%d", 101:130)
)
head(dta)
## T101 T102 T103 T104 T105 T106 T107 T108 T109 T110 T111 T112 T113 T114
## 1 312 272 350 286 268 328 298 356 292 308 296 372 396 402
## 2 354 346 384 342 302 312 322 376 306 402 320 298 308 414
## 3 256 284 320 274 324 268 370 430 314 312 362 256 342 388
## 4 260 294 306 292 264 290 272 268 344 362 330 280 354 320
## 5 204 272 250 260 314 308 246 236 208 268 272 264 308 236
## 6 590 312 286 310 778 364 318 316 316 298 344 262 274 330
## T115 T116 T117 T118 T119 T120 T121 T122 T123 T124 T125 T126 T127 T128
## 1 280 330 254 282 350 328 332 308 292 258 340 242 306 328
## 2 304 422 388 422 426 338 332 426 478 372 392 374 430 388
## 3 302 366 298 396 274 226 328 274 258 220 236 272 322 284
## 4 334 276 418 288 338 350 350 324 286 322 280 256 218 256
## 5 238 350 272 252 252 236 306 238 350 206 260 280 274 318
## 6 312 310 376 326 346 334 282 292 282 300 290 302 300 306
## T129 T130
## 1 294 272
## 2 354 368
## 3 274 356
## 4 220 356
## 5 268 210
## 6 294 444
# Reshape to long format: one row per subject x measurement.
# Group labels: the first 11 rows are coded "N" and the last 6 "Y".
# NOTE(review): the data source describes the first 11 rows as
# non-schizophrenic and the last 6 as schizophrenic, which also matches the
# slower, more variable times in the last 6 rows; the labels were previously
# the other way round -- confirm against the header of schiz.asc.
new_dta <- dta %>%
  mutate(schizoid = rep(c("N", "Y"), times = c(11, 6)), id = 1:17) %>%
  gather(key = "measure", value = "ms", 1:30) %>%
  mutate(measure = as.factor(measure))
# Descriptive statistics of the data: mean and SD of reaction time by group.
aggregate(ms ~ schizoid, new_dta, mean)
## schizoid ms
## 1 N 310.1697
## 2 Y 506.8667
aggregate(ms ~ schizoid, new_dta, sd)
## schizoid ms
## 1 N 64.8805
## 2 Y 262.8473
ggplot(new_dta, aes(measure, ms, color = schizoid)) +
  coord_flip() +
  geom_boxplot()
# ANCOVA: reaction time modelled on measurement occasion and group.
m0 <- lm(ms ~ measure + schizoid, data = new_dta)
anova(m0)
## Analysis of Variance Table
##
## Response: ms
## Df Sum Sq Mean Sq F value Pr(>F)
## measure 29 638735 22025 0.8046 0.7571
## schizoid 1 4506212 4506212 164.6050 <2e-16 ***
## Residuals 479 13113064 27376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Finding: after controlling for variation across measurement occasions,
# reaction time differs between subjects with and without the diagnosis.
# Read the roster: the first line supplies the column names, and the data
# rows start after the second line (which is skipped).
fL <- paste0("http://", IDPW, "140.116.183.121/~sheu/dataM/Data/ncku_roster.csv")
headers <- read.csv(fL, header = FALSE, nrows = 1, as.is = TRUE)
dta <- read.csv(fL, skip = 2, header = FALSE)
colnames(dta) <- headers
# Split the second column on spaces; only the first piece (major) is used
# afterwards, and any pieces beyond the second are discarded with a warning.
new_dta <- dta %>%
  separate(2, into = c("major", "info"), sep = " ") %>%
  mutate(major = as.factor(major))
## Warning: Expected 2 pieces. Additional pieces discarded in 15 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].
# Bar chart of the number of students per major.
ggplot(new_dta, aes(major, ..count..)) +
  geom_bar() +
  labs(x = "學生來源", y = "Count", subtitle = "資料管理修課系所")
pacman::p_load(RISmed)
# Query PubMed once per year in Span for 'deep learning' and collect the hit
# counts into a vector named by year.
Span <- 2001:2017
Tally <- setNames(
  sapply(Span, function(yr) {
    hits <- EUtilsSummary(
      "deep learning",
      type = "esearch",
      db = "pubmed",
      mindate = yr,
      maxdate = yr
    )
    QueryCount(hits)
  }),
  Span
)
# Bar chart of yearly article counts with axis labels perpendicular to the axes.
barplot(
  Tally,
  las = 2,
  ylim = c(0, 1500),
  main = "",
  ylab = "Number of Articles"
)