2 File I/O

2.1 Read

# Load the cereal data set from the working directory into a data frame.
cereal <- read.csv(file = "Cereal.csv")

2.2 Data frames

# Preview the first rows of the data frame (six rows are printed below).
# NOTE(review): potass is -1 in row 5 of the printed output - this looks like
# a missing-value sentinel; confirm before using that column in statistics.
head(cereal)
##                        name mfr type calories protein fat sodium fiber carbo
## 1                 100%_Bran   N    C       70       4   1    130  10.0   5.0
## 2         100%_Natural_Bran   Q    C      120       3   5     15   2.0   8.0
## 3                  All-Bran   K    C       70       4   1    260   9.0   7.0
## 4 All-Bran_with_Extra_Fiber   K    C       50       4   0    140  14.0   8.0
## 5            Almond_Delight   R    C      110       2   2    200   1.0  14.0
## 6   Apple_Cinnamon_Cheerios   G    C      110       2   2    180   1.5  10.5
##   sugars potass vitamins shelf weight cups   rating
## 1      6    280       25     3      1 0.33 68.40297
## 2      8    135        0     3      1 1.00 33.98368
## 3      5    320       25     3      1 0.33 59.42551
## 4      0    330       25     3      1 0.50 93.70491
## 5      8     -1       25     3      1 0.75 34.38484
## 6     10     70       25     1      1 0.75 29.50954
# Confirm that read.csv() returned a data frame.
class(cereal)
## [1] "data.frame"
# List the row names ([[1]]) and the column names ([[2]]) of the data frame.
dimnames(cereal)
## [[1]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76" "77"
## 
## [[2]]
##  [1] "name"     "mfr"      "type"     "calories" "protein"  "fat"     
##  [7] "sodium"   "fiber"    "carbo"    "sugars"   "potass"   "vitamins"
## [13] "shelf"    "weight"   "cups"     "rating"
# Number of rows in the data frame.
nrow(cereal)
## [1] 77
# Extract the calories column as a vector using `$`.
cereal$calories
##  [1]  70 120  70  50 110 110 110 130  90  90 120 110 120 110 110 110 100 110 110
## [20] 110 100 110 100 100 110 110 100 120 120 110 100 110 100 110 120 120 110 110
## [39] 110 140 110 100 110 100 150 150 160 100 120 140  90 130 120 100  50  50 100
## [58] 100 120 100  90 110 110  80  90  90 110 110  90 110 140 100 110 110 100 100
## [77] 110
# Extract the same column by name using `[[`; the printed values are identical
# to those obtained with `$`.
cereal[["calories"]]
##  [1]  70 120  70  50 110 110 110 130  90  90 120 110 120 110 110 110 100 110 110
## [20] 110 100 110 100 100 110 110 100 120 120 110 100 110 100 110 120 120 110 110
## [39] 110 140 110 100 110 100 150 150 160 100 120 140  90 130 120 100  50  50 100
## [58] 100 120 100  90 110 110  80  90  90 110 110  90 110 140 100 110 110 100 100
## [77] 110
# Show the first ten rows with all columns.
cereal[seq_len(10), ]
##                         name mfr type calories protein fat sodium fiber carbo
## 1                  100%_Bran   N    C       70       4   1    130  10.0   5.0
## 2          100%_Natural_Bran   Q    C      120       3   5     15   2.0   8.0
## 3                   All-Bran   K    C       70       4   1    260   9.0   7.0
## 4  All-Bran_with_Extra_Fiber   K    C       50       4   0    140  14.0   8.0
## 5             Almond_Delight   R    C      110       2   2    200   1.0  14.0
## 6    Apple_Cinnamon_Cheerios   G    C      110       2   2    180   1.5  10.5
## 7                Apple_Jacks   K    C      110       2   0    125   1.0  11.0
## 8                    Basic_4   G    C      130       3   2    210   2.0  18.0
## 9                  Bran_Chex   R    C       90       2   1    200   4.0  15.0
## 10               Bran_Flakes   P    C       90       3   0    210   5.0  13.0
##    sugars potass vitamins shelf weight cups   rating
## 1       6    280       25     3   1.00 0.33 68.40297
## 2       8    135        0     3   1.00 1.00 33.98368
## 3       5    320       25     3   1.00 0.33 59.42551
## 4       0    330       25     3   1.00 0.50 93.70491
## 5       8     -1       25     3   1.00 0.75 34.38484
## 6      10     70       25     1   1.00 0.75 29.50954
## 7      14     30       25     2   1.00 1.00 33.17409
## 8       8    100       25     3   1.33 0.75 37.03856
## 9       6    125       25     1   1.00 0.67 49.12025
## 10      5    190       25     3   1.00 0.67 53.31381
# Keep only the rows whose manufacturer code (mfr) equals "K".
Kelloggs <- subset(cereal, subset = mfr == "K")

2.3 Factors

# Re-read the file, this time storing text columns as factors.
cereal <- read.csv(file = "Cereal.csv", stringsAsFactors = TRUE)
# Both mfr and type are now of class "factor".
class(cereal$mfr)
## [1] "factor"
class(cereal$type)
## [1] "factor"
# is.factor() gives the same answer as a logical value.
is.factor(cereal$mfr)
## [1] TRUE
is.factor(cereal$type)
## [1] TRUE
# Count the distinct levels of each factor.
nlevels(cereal$mfr)
## [1] 7
nlevels(cereal$type)
## [1] 2

2.4 Vectors

# Copy the calories column into a standalone vector.
cereal.calories <- cereal[["calories"]]
# The vector has as many elements as the data frame has rows.
length(cereal.calories)
## [1] 77
# Elements 5 through 10.
cereal.calories[5:10]
## [1] 110 110 110 130  90  90
# Append one extra value (55) to the end of the vector.
cereal.calories <- c(cereal.calories, 55)

2.5 Matrix

# Convert the whole data frame to a matrix. The data frame contains
# non-numeric columns, so the result is a character matrix (see str() output).
a <- as.matrix(cereal)
str(a)
##  chr [1:77, 1:16] "100%_Bran" "100%_Natural_Bran" "All-Bran" ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:16] "name" "mfr" "type" "calories" ...
# Build a numeric matrix by dropping the three non-numeric columns.
# Columns are excluded by exact name with a logical mask instead of
# -grep(...): negating an empty grep() result selects zero columns, and an
# unanchored pattern also removes any column whose name merely contains
# "mfr", "name" or "type". drop = FALSE keeps a data frame even when a single
# column remains, so as.matrix() always receives a data frame.
b <- as.matrix(
  cereal[, !(colnames(cereal) %in% c("name", "mfr", "type")), drop = FALSE]
)
str(b)
##  num [1:77, 1:13] 70 120 70 50 110 110 110 130 90 90 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:13] "calories" "protein" "fat" "sodium" ...

3 Numerical summary

3.1 Summary

# Minimum, quartiles, median, mean and maximum of the sodium column.
summary(cereal$sodium)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   130.0   180.0   159.7   210.0   320.0

As the summary shows, the median, 1st quartile and 3rd quartile of the sodium column are 180.0, 130.0 and 210.0, respectively.

3.2 Basic statistics

# Largest sodium value.
max(cereal$sodium)
## [1] 320
# Smallest sodium value.
min(cereal$sodium)
## [1] 0
# Sample standard deviation of sodium.
sd(cereal$sodium)
## [1] 83.8323
# Arithmetic mean of sodium.
mean(cereal$sodium)
## [1] 159.6753

Find the mean sodium for each manufacturer (mfr). Method 1:

# Mean sodium per manufacturer, returned as a data frame with one row per mfr
# level (columns Group.1 and x in the printed output).
aggregate(x = cereal$sodium, by = list(cereal$mfr), FUN = mean)
##   Group.1        x
## 1       A   0.0000
## 2       G 200.4545
## 3       K 174.7826
## 4       N  37.5000
## 5       P 146.1111
## 6       Q  92.5000
## 7       R 198.1250

Method 2:

# Same per-manufacturer means, returned with the mfr levels as names.
tapply(X = cereal$sodium, INDEX = cereal$mfr, FUN = mean)
##        A        G        K        N        P        Q        R 
##   0.0000 200.4545 174.7826  37.5000 146.1111  92.5000 198.1250

4 Graphical summary

4.1 Boxplot

# Keep only the two columns the box plot needs.
plotdata <- cereal[, c("sodium", "mfr")]
# One box per manufacturer showing the spread of its sodium values.
boxplot(
  sodium ~ mfr,
  data = plotdata,
  main = "mfr boxplot",
  xlab = "mfr",
  ylab = "sodium"
)

4.2 Scatter plot

# Keep only the two columns the scatter plot needs.
plotdata2 <- subset(cereal, select = c(sodium, calories))
# Scatter plot of sodium (y axis) against calories (x axis). The title names
# both plotted variables because this is a two-variable plot, not a
# distribution of calories alone.
plot(sodium ~ calories, data = plotdata2, main = "sodium vs. calories")

5 Write Data to File

# Save the Kelloggs data frame (the rows with mfr == "K") as a CSV file in the
# working directory.
# NOTE(review): write.csv() writes the row names as an unnamed first column by
# default; pass row.names = FALSE if that column is not wanted.
write.csv(Kelloggs, file = "kelloggs.csv")

APP. Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 1
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: