# Switch to a Chinese locale; the roster read below is Big5-encoded.
# NOTE(review): the locale name "Chinese" is platform-specific -- confirm it
# resolves on the machine that runs this script.
Sys.setlocale(locale = "Chinese")
# Read the roster. fill = TRUE pads rows that have fewer fields than the others.
# TRUE is spelled out because the shorthand T is an ordinary, reassignable
# variable.
dta_roster <- read.csv("ncku_roster.csv",
  header = TRUE, fileEncoding = "big5", fill = TRUE)
# Remove the first data row.
dta_roster <- dta_roster[-1, ]
# Extract the major (the first three characters) from the major.year.class
# column and store it in the column `major`.
dta_roster$major <- substr(dta_roster$系.年.班, 1, 3)
# Show the number of students from each major.
table(dta_roster$major)
##
## 教育所 心理所 心理系
## 4 7 4
# Read the tab-separated file; its first line supplies the column names.
# TRUE is spelled out because the shorthand T is an ordinary, reassignable
# variable.
dta_rtw <- read.table("P005.txt", sep = "\t", header = TRUE)
# Preview the first six rows.
head(dta_rtw)
## City COL PD URate Pop Taxes Income RTWL
## 1 Atlanta 169 414 13.6 1790128 5128 2961 1
## 2 Austin 143 239 11.0 396891 4303 1711 1
## 3 Bakersfield 339 43 23.7 349874 4166 2122 0
## 4 Baltimore 173 951 21.0 2147850 5001 4654 0
## 5 Baton Rouge 99 255 16.0 411725 3965 1620 1
## 6 Boston 363 1257 24.4 3914071 4928 5634 0
# Scatter plot with Income on the x-axis and Taxes on the y-axis.
plot(Taxes ~ Income, data = dta_rtw)
In this scatter plot, I cannot find any relationship between the variables Income and Taxes. I think that different cities might have different income levels and tax policies. Therefore, it is necessary to provide more information for this graph. For example, if it were possible to categorize the cities by tax policy and label those with similar policies, we could then examine the relationship between taxes and income within each category. Besides, there are other variables in the data frame that might provide useful information, but I don’t know what they stand for.
# Read the tab-separated file juniorSchools.txt; its first line supplies the
# column names. TRUE is spelled out because the shorthand T is an ordinary,
# reassignable variable.
jsp <- read.table("juniorSchools.txt", sep = "\t", header = TRUE)
# Preview the first six rows.
head(jsp)
## school class sex soc ravens pupil english math year
## 1 S1 C1 G 9 23 P1 72 23 0
## 2 S1 C1 G 9 23 P1 80 24 1
## 3 S1 C1 G 9 23 P1 39 23 2
## 4 S1 C1 B 2 15 P2 7 14 0
## 5 S1 C1 B 2 15 P2 17 11 1
## 6 S1 C1 B 2 22 P3 88 36 0
# Rename the variable 'sex' to 'gender'. The column is matched by name rather
# than by position, so a different column order cannot cause the wrong column
# to be renamed.
colnames(jsp)[colnames(jsp) == "sex"] <- "gender"
# Show the resulting column names.
colnames(jsp)
## [1] "school" "class" "gender" "soc" "ravens" "pupil" "english"
## [8] "math" "year"
# Re-label the values of the social class variable (codes 1 to 9) and convert
# it to a factor in one step.
# Mapping every code through factor() works even when a class has no rows or
# `soc` contains NA, and it fixes the level order to the numeric code order
# instead of leaving it to the locale's alphabetical sorting.
# Codes outside 1-9 become NA.
# The is.factor() guard keeps an accidental re-run from turning the already
# labelled values into NA.
if (!is.factor(jsp$soc)) {
  jsp$soc <- factor(
    jsp$soc,
    levels = 1:9,
    labels = c(
      "I", "II", "III_0man", "III_man", "IV", "V",
      "VI_Unemp_L", "VII_emp_NC", "VIII_Miss_Dad"
    )
  )
}
# Box plot of the math score for each social class.
boxplot(math ~ soc, data = jsp)
# Create the data folder only when it does not exist yet; calling dir.create()
# on an existing folder raises an "already exists" warning on every re-run.
# NOTE(review): "/data" is an absolute path (the root of the file system or of
# the current drive), not a folder inside the project -- confirm it is intended.
if (!dir.exists("/data")) {
  dir.create("/data")
}
# Save jsp as R source text: dump() writes R code that re-creates the object.
dump("jsp", file = "/data/jsp.Rdta")
# Load it back by evaluating the dumped code, which re-creates `jsp`.
source("/data/jsp.Rdta")
# Read the four files of Subject1 with a single lapply() call. Arguments given
# after the function name are forwarded to read.table() on every call, so each
# file is read with the same settings: tab-separated, first line skipped, and
# the following line used as the header.
w_files <- file.path("Subject1", paste0(1:4, "w.dat"))
w_list <- lapply(w_files, read.table, sep = "\t", skip = 1, header = TRUE)
# Combine the 4 data frames into one by stacking their rows.
dta <- dplyr::bind_rows(w_list)
# Remove the last column, which is empty. The position is taken from ncol()
# rather than hard-coded.
dta <- dta[, -ncol(dta)]
# Print the first 6 rows.
head(dta)
## X.......F7. X......FT7. X.......T7. X......TP7. X.......P7. X......Fp1.
## 1 -0.9733 -1.0071 -0.1834 -1.0521 -0.7046 -1.1503
## 2 -0.7079 -1.0216 -0.1705 -1.1422 -0.7915 -1.0843
## 3 -0.3732 -0.9813 -0.1544 -1.1760 -0.8205 -1.0071
## 4 -0.0225 -0.8784 -0.1239 -1.0891 -0.8173 -0.9218
## 5 0.3523 -0.7095 -0.0611 -0.8800 -0.6966 -0.8173
## 6 0.6998 -0.5116 0.0483 -0.5711 -0.5277 -0.7095
## X.......F3. X......FC3. X.......C3. X......CP3. X.......P3. X.......O1.
## 1 -1.0425 -0.5212 -0.2477 -0.0064 0.2349 0.6226
## 2 -1.0167 -0.5196 -0.2638 0.0064 0.2445 0.4842
## 3 -0.9556 -0.4987 -0.2494 0.0595 0.3041 0.3893
## 4 -0.8639 -0.4569 -0.2043 0.1351 0.3845 0.3105
## 5 -0.7481 -0.3604 -0.1400 0.2059 0.4923 0.2687
## 6 -0.6274 -0.2397 -0.0547 0.2703 0.5936 0.3089
## X.......Fz. X......FCz. X.......Cz. X......CPz. X.......Pz. X.......Oz.
## 1 -0.3620 0.2027 0.3218 0.5293 0.0692 0.9218
## 2 -0.3008 0.2236 0.2751 0.4617 -0.0322 0.8205
## 3 -0.2365 0.2494 0.2204 0.4408 -0.0949 0.7577
## 4 -0.1770 0.2848 0.1947 0.4328 -0.1078 0.7239
## 5 -0.1384 0.2928 0.1673 0.4038 -0.1046 0.7256
## 6 -0.0901 0.2767 0.1512 0.3362 -0.0724 0.7658
## X......Fp2. X.......F4. X......FC4. X.......C4. X......CP4. X.......P4.
## 1 -1.2339 -0.6886 0.7561 0.7931 0.5711 0.9556
## 2 -1.1213 -0.6435 0.7529 0.7143 0.6306 0.9331
## 3 -0.9315 -0.5792 0.7497 0.6564 0.6596 0.9170
## 4 -0.7062 -0.5325 0.7352 0.6017 0.6483 0.8961
## 5 -0.5164 -0.5196 0.6773 0.5357 0.5904 0.8205
## 6 -0.3781 -0.5052 0.6033 0.4102 0.4617 0.6757
## X.......O2. X.......F8. X......FT8. X.......T8. X......TP8. X.......P8.
## 1 0.9090 0.0611 -0.3459 0.5068 1.4415 1.4479
## 2 0.7626 0.1673 -0.2864 0.8092 1.8243 1.4704
## 3 0.6451 0.2461 -0.1657 1.0972 2.1622 1.4672
## 4 0.5373 0.2687 -0.0322 1.3498 2.4083 1.4173
## 5 0.4698 0.2059 0.0676 1.5106 2.5258 1.3385
## 6 0.4488 0.0627 0.1593 1.5412 2.4389 1.1937
I tried to use “lapply” to save typing, but I don’t know how to provide parameters for the function in “lapply”. In this case, to read these 4 files I have to provide some parameters for the function read.table.
# Skip the first 4 lines of the file before parsing; the empty separator makes
# read.table() split the fields on whitespace.
dta_schiz <- read.table(
  file = "schiz.asc.txt",
  skip = 4,
  sep = ""
)
# Preview the first six rows.
head(x = dta_schiz)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
## 1 312 272 350 286 268 328 298 356 292 308 296 372 396 402 280 330 254 282 350
## 2 354 346 384 342 302 312 322 376 306 402 320 298 308 414 304 422 388 422 426
## 3 256 284 320 274 324 268 370 430 314 312 362 256 342 388 302 366 298 396 274
## 4 260 294 306 292 264 290 272 268 344 362 330 280 354 320 334 276 418 288 338
## 5 204 272 250 260 314 308 246 236 208 268 272 264 308 236 238 350 272 252 252
## 6 590 312 286 310 778 364 318 316 316 298 344 262 274 330 312 310 376 326 346
## V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30
## 1 328 332 308 292 258 340 242 306 328 294 272
## 2 338 332 426 478 372 392 374 430 388 354 368
## 3 226 328 274 258 220 236 272 322 284 274 356
## 4 350 350 324 286 322 280 256 218 256 220 356
## 5 236 306 238 350 206 260 280 274 318 268 210
## 6 334 282 292 282 300 290 302 300 306 294 444
# Number of rows and columns in the data.
dim(x = dta_schiz)
## [1] 17 30
# Descriptive statistics for rows 1 through 11.
summary(object = dta_schiz[seq_len(11), ])
## V1 V2 V3 V4 V5
## Min. :204.0 Min. :240.0 Min. :230 Min. :222.0 Min. :210.0
## 1st Qu.:242.0 1st Qu.:272.0 1st Qu.:280 1st Qu.:261.0 1st Qu.:267.0
## Median :260.0 Median :292.0 Median :306 Median :278.0 Median :302.0
## Mean :301.6 Mean :296.5 Mean :310 Mean :285.8 Mean :343.1
## 3rd Qu.:315.0 3rd Qu.:318.0 3rd Qu.:350 3rd Qu.:301.0 3rd Qu.:345.0
## Max. :590.0 Max. :364.0 Max. :384 Max. :364.0 Max. :778.0
## V6 V7 V8 V9
## Min. :254.0 Min. :232.0 Min. :228.0 Min. :208.0
## 1st Qu.:287.0 1st Qu.:253.0 1st Qu.:256.0 1st Qu.:272.0
## Median :310.0 Median :298.0 Median :316.0 Median :294.0
## Mean :317.3 Mean :292.9 Mean :314.2 Mean :285.6
## 3rd Qu.:335.0 3rd Qu.:320.0 3rd Qu.:366.0 3rd Qu.:310.0
## Max. :430.0 Max. :370.0 Max. :430.0 Max. :344.0
## V10 V11 V12 V13
## Min. :246.0 Min. :256.0 Min. :248.0 Min. :260.0
## 1st Qu.:283.0 1st Qu.:281.0 1st Qu.:253.0 1st Qu.:280.0
## Median :308.0 Median :296.0 Median :264.0 Median :308.0
## Mean :310.4 Mean :302.7 Mean :284.5 Mean :334.2
## 3rd Qu.:323.0 3rd Qu.:325.0 3rd Qu.:307.0 3rd Qu.:348.0
## Max. :402.0 Max. :362.0 Max. :372.0 Max. :542.0
## V14 V15 V16 V17
## Min. :236.0 Min. :238.0 Min. :268.0 Min. :254.0
## 1st Qu.:293.0 1st Qu.:303.0 1st Qu.:295.0 1st Qu.:296.0
## Median :336.0 Median :312.0 Median :330.0 Median :376.0
## Mean :332.4 Mean :320.2 Mean :336.5 Mean :357.8
## 3rd Qu.:376.0 3rd Qu.:343.0 3rd Qu.:358.0 3rd Qu.:394.0
## Max. :414.0 Max. :418.0 Max. :434.0 Max. :480.0
## V18 V19 V20 V21
## Min. :216.0 Min. :252.0 Min. :226.0 Min. :264.0
## 1st Qu.:267.0 1st Qu.:308.0 1st Qu.:282.0 1st Qu.:296.0
## Median :310.0 Median :336.0 Median :322.0 Median :328.0
## Mean :314.2 Mean :337.8 Mean :303.3 Mean :322.9
## 3rd Qu.:365.0 3rd Qu.:348.0 3rd Qu.:331.0 3rd Qu.:341.0
## Max. :422.0 Max. :460.0 Max. :350.0 Max. :392.0
## V22 V23 V24 V25 V26
## Min. :232.0 Min. :222.0 Min. :206.0 Min. :226.0 Min. :234
## 1st Qu.:251.0 1st Qu.:270.0 1st Qu.:235.0 1st Qu.:269.0 1st Qu.:264
## Median :284.0 Median :292.0 Median :262.0 Median :280.0 Median :290
## Mean :295.5 Mean :303.5 Mean :277.3 Mean :292.7 Mean :294
## 3rd Qu.:316.0 3rd Qu.:314.0 3rd Qu.:311.0 3rd Qu.:319.0 3rd Qu.:314
## Max. :426.0 Max. :478.0 Max. :372.0 Max. :392.0 Max. :374
## V27 V28 V29 V30
## Min. :208.0 Min. :232.0 Min. :206.0 Min. :206.0
## 1st Qu.:274.0 1st Qu.:257.0 1st Qu.:244.0 1st Qu.:258.0
## Median :286.0 Median :306.0 Median :294.0 Median :356.0
## Mean :294.4 Mean :305.1 Mean :305.1 Mean :333.6
## 3rd Qu.:314.0 3rd Qu.:337.0 3rd Qu.:353.0 3rd Qu.:374.0
## Max. :430.0 Max. :406.0 Max. :462.0 Max. :510.0
# Descriptive statistics for the second group, rows 12 through 17. The range
# must start at 12: rows 1 through 11 form the first group, so starting at 11
# would count row 11 in both summaries.
# NOTE: the printed output below was produced with rows 11:17 and needs to be
# regenerated.
summary(dta_schiz[12:17, ])
## V1 V2 V3 V4 V5
## Min. :240.0 Min. : 272 Min. : 264.0 Min. : 226 Min. :278
## 1st Qu.:325.0 1st Qu.: 340 1st Qu.: 320.0 1st Qu.: 256 1st Qu.:378
## Median :402.0 Median : 466 Median : 350.0 Median : 348 Median :460
## Mean :422.9 Mean : 516 Mean : 526.6 Mean : 452 Mean :478
## 3rd Qu.:524.0 3rd Qu.: 590 3rd Qu.: 423.0 3rd Qu.: 367 3rd Qu.:586
## Max. :620.0 Max. :1014 Max. :1586.0 Max. :1344 Max. :680
## V6 V7 V8 V9 V10
## Min. :286 Min. :260 Min. :264.0 Min. :298.0 Min. : 312.0
## 1st Qu.:442 1st Qu.:319 1st Qu.:330.0 1st Qu.:353.0 1st Qu.: 387.0
## Median :598 Median :478 Median :382.0 Median :524.0 Median : 410.0
## Mean :582 Mean :450 Mean :382.9 Mean :494.3 Mean : 530.3
## 3rd Qu.:734 3rd Qu.:501 3rd Qu.:437.0 3rd Qu.:592.0 3rd Qu.: 570.0
## Max. :838 Max. :772 Max. :500.0 Max. :748.0 Max. :1076.0
## V11 V12 V13 V14
## Min. :286.0 Min. :248.0 Min. :276.0 Min. : 354.0
## 1st Qu.:348.0 1st Qu.:329.0 1st Qu.:308.0 1st Qu.: 372.0
## Median :446.0 Median :378.0 Median :514.0 Median : 458.0
## Mean :492.3 Mean :455.1 Mean :500.3 Mean : 650.3
## 3rd Qu.:510.0 3rd Qu.:553.0 3rd Qu.:564.0 3rd Qu.: 658.0
## Max. :998.0 Max. :796.0 Max. :968.0 Max. :1680.0
## V15 V16 V17 V18 V19
## Min. : 262.0 Min. :324 Min. :256.0 Min. :260.0 Min. :280.0
## 1st Qu.: 308.0 1st Qu.:357 1st Qu.:341.0 1st Qu.:383.0 1st Qu.:310.0
## Median : 434.0 Median :404 Median :400.0 Median :422.0 Median :400.0
## Mean : 593.7 Mean :474 Mean :411.4 Mean :478.3 Mean :442.3
## 3rd Qu.: 572.0 3rd Qu.:497 3rd Qu.:489.0 3rd Qu.:590.0 3rd Qu.:540.0
## Max. :1700.0 Max. :882 Max. :564.0 Max. :720.0 Max. :716.0
## V20 V21 V22 V23 V24
## Min. :262.0 Min. :298 Min. :248.0 Min. : 250.0 Min. :234
## 1st Qu.:297.0 1st Qu.:359 1st Qu.:319.0 1st Qu.: 273.0 1st Qu.:323
## Median :366.0 Median :374 Median :350.0 Median : 420.0 Median :382
## Mean :349.1 Mean :392 Mean :439.7 Mean : 730.9 Mean :434
## 3rd Qu.:383.0 3rd Qu.:426 3rd Qu.:434.0 3rd Qu.:1093.0 3rd Qu.:514
## Max. :456.0 Max. :502 Max. :974.0 Max. :1714.0 Max. :748
## V25 V26 V27 V28
## Min. :280.0 Min. :302.0 Min. :282.0 Min. :234.0
## 1st Qu.:358.0 1st Qu.:327.0 1st Qu.:350.0 1st Qu.:293.0
## Median :440.0 Median :356.0 Median :544.0 Median :302.0
## Mean :415.7 Mean :478.3 Mean :539.4 Mean :344.6
## 3rd Qu.:482.0 3rd Qu.:613.0 3rd Qu.:633.0 3rd Qu.:415.0
## Max. :510.0 Max. :810.0 Max. :984.0 Max. :460.0
## V29 V30
## Min. :344.0 Min. :244.0
## 1st Qu.:388.0 1st Qu.:343.0
## Median :424.0 Median :432.0
## Mean :470.6 Mean :432.6
## 3rd Qu.:550.0 3rd Qu.:512.0
## Max. :650.0 Max. :642.0
The first table shows the descriptive statistics for the non-schizophrenic group; the second table shows them for the schizophrenic group. This output is not easy to read. Do you have any suggestions?