# Import the worms data from CSV; stringsAsFactors = FALSE keeps text columns as character
worms <- read.csv("Data Files/worms.csv", stringsAsFactors = FALSE)
# data xlsx file import
# install.packages("readxl")
library(readxl)
# Read the three sheets of urbanpop.xlsx one at a time
pop_sheet_1 <- read_excel("urbanpop.xlsx", sheet = 1)
pop_sheet_2 <- read_excel("urbanpop.xlsx", sheet = 2)
pop_sheet_3 <- read_excel("urbanpop.xlsx", sheet = 3)
# Collect the three sheets in a list: pop_list
pop_list <- list(pop_sheet_1, pop_sheet_2, pop_sheet_3)
# Bind the sheets column-wise into one table: urban.
# The first column of sheets 2 and 3 is dropped (presumably the country column,
# which sheet 1 already provides).
urban <- cbind(pop_sheet_1, pop_sheet_2[-1], pop_sheet_3[-1])
# Count the missing values in each column of urban
sapply(urban, function(column) sum(is.na(column)))
## country 1960 1961 1962 1963 1964 1965 1966 1967
## 0 11 0 0 0 0 0 0 0
## 1968 1969 1970 1971 1972 1973 1974 1975 1976
## 0 0 0 0 0 0 0 0 0
## 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 0 0 0 0 0 0 0 0 0
## 1986 1987 1988 1989 1990 1991 1992 1993 1994
## 0 0 0 0 0 0 1 1 1
## 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 0 0 0 0 0 0 0 0 0
## 2004 2005 2006 2007 2008 2009 2010 2011
## 0 0 0 0 0 0 0 0
# Drop every row of urban that contains at least one NA: urban_clean
urban_clean <- na.omit(urban)
# Re-count the NAs per column of the cleaned table
sapply(urban_clean, function(column) sum(is.na(column)))
## country 1960 1961 1962 1963 1964 1965 1966 1967
## 0 0 0 0 0 0 0 0 0
## 1968 1969 1970 1971 1972 1973 1974 1975 1976
## 0 0 0 0 0 0 0 0 0
## 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 0 0 0 0 0 0 0 0 0
## 1986 1987 1988 1989 1990 1991 1992 1993 1994
## 0 0 0 0 0 0 0 0 0
## 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 0 0 0 0 0 0 0 0 0
## 2004 2005 2006 2007 2008 2009 2010 2011
## 0 0 0 0 0 0 0 0
# Read every sheet of urbanpop.xlsx in one go: lapply() hands each sheet name
# returned by excel_sheets() to read_excel(): pop_list_lapply
pop_list_lapply <- lapply(
  excel_sheets("urbanpop.xlsx"),
  read_excel,
  path = "urbanpop.xlsx"
)
# To display the structure of the list, run str(pop_list_lapply)
# Import the first Excel sheet of urbanpop_nonames.xlsx, which is read without a
# header row (R generates the column names): pop_a
pop_a <- read_excel("urbanpop_nonames.xlsx", col_names = FALSE, sheet = 1)
summary(pop_a)
## X__1 X__2 X__3
## Length:209 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 88978 1st Qu.: 70644
## Mode :character Median : 580675 Median : 570159
## Mean : 4988124 Mean : 4991613
## 3rd Qu.: 3077228 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :11
## X__4 X__5 X__6
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
##
## X__7 X__8
## Min. : 1281 Min. : 1349
## 1st Qu.: 88633 1st Qu.: 93638
## Median : 679109 Median : 735139
## Mean : 5637394 Mean : 5790281
## 3rd Qu.: 3317422 3rd Qu.: 3418036
## Max. :139663053 Max. :141962708
##
# Import the first Excel sheet of urbanpop_nonames.xlsx again, this time
# supplying the column names ourselves: pop_b
cols <- c("country", paste("year", 1960:1966, sep = "_"))
pop_b <- read_excel("urbanpop_nonames.xlsx", col_names = cols)
summary(pop_b)
## country year_1960 year_1961
## Length:209 Min. : 3378 Min. : 1028
## Class :character 1st Qu.: 88978 1st Qu.: 70644
## Mode :character Median : 580675 Median : 570159
## Mean : 4988124 Mean : 4991613
## 3rd Qu.: 3077228 3rd Qu.: 2807280
## Max. :126469700 Max. :129268133
## NA's :11
## year_1962 year_1963 year_1964
## Min. : 1090 Min. : 1154 Min. : 1218
## 1st Qu.: 74974 1st Qu.: 81870 1st Qu.: 84953
## Median : 593968 Median : 619331 Median : 645262
## Mean : 5141592 Mean : 5303711 Mean : 5468966
## 3rd Qu.: 2948396 3rd Qu.: 3148941 3rd Qu.: 3296444
## Max. :131974143 Max. :134599886 Max. :137205240
##
## year_1965 year_1966
## Min. : 1281 Min. : 1349
## 1st Qu.: 88633 1st Qu.: 93638
## Median : 679109 Median : 735139
## Mean : 5637394 Mean : 5790281
## 3rd Qu.: 3317422 3rd Qu.: 3418036
## Max. :139663053 Max. :141962708
##
# Back to the worms data: show the first three observations
head(worms, 3L)
## Field.Name Area Slope Vegetation Soil.pH Damp Worm.density
## 1 Nashs.Field 3.6 11 Grassland 4.1 FALSE 4
## 2 Silwood.Bottom 5.1 2 Arable 5.2 FALSE 7
## 3 Nursery.Field 2.8 3 Grassland 4.3 FALSE 2
# Work on a copy and put its columns on the search path
worms_attach <- worms
attach(worms_attach)
head(worms_attach, 3L)
## Field.Name Area Slope Vegetation Soil.pH Damp Worm.density
## 1 Nashs.Field 3.6 11 Grassland 4.1 FALSE 4
## 2 Silwood.Bottom 5.1 2 Arable 5.2 FALSE 7
## 3 Nursery.Field 2.8 3 Grassland 4.3 FALSE 2
# Why does the book use attach()? Once a data frame is attached, a column can be
# referenced by its bare name. Compare the bare name with the explicit form below:
mean(Area)
## [1] 2.99
mean(worms[["Area"]])
## [1] 2.99
# Create missing data
das_df <- read.csv("Data Files/das.csv", stringsAsFactors = FALSE)
attach(das_df)
# Set six values of y to NA. Only das_df is modified here; the copy that attach()
# placed on the search path is presumably left unchanged -- TODO confirm if the
# attached y is ever used.
is.na(das_df$y) <- c(1, 3, 5, 7, 9, 10)
# Total number of NAs in the whole data frame (the easy case)
sum(is.na(das_df))
## [1] 6
# A harder case: NAs scattered over several columns
missing_df <- data.frame(
  a = 1:5,
  b = c(1, 2, NA, 4, NA),
  c = c(NA, 2, 3, NA, NA)
)
missing_df # goal: count the missing values in each of the columns a, b and c
## a b c
## 1 1 1 NA
## 2 2 2 2
## 3 3 NA 3
## 4 4 4 NA
## 5 5 NA NA
# Case 1: build a logical NA mask for each column, then sum each mask
cols <- names(missing_df)
missing <- lapply(missing_df[cols], is.na)
missing
## $a
## [1] FALSE FALSE FALSE FALSE FALSE
##
## $b
## [1] FALSE FALSE TRUE FALSE TRUE
##
## $c
## [1] TRUE FALSE FALSE TRUE TRUE
num_missing <- sapply(missing, sum)
num_missing
## a b c
## 0 2 3
# Case 2: apply() with MARGIN = 2 calls the function once per column
# (MARGIN = 1 would call it once per row)
apply(missing_df, 2, function(column) sum(is.na(column)))
## a b c
## 0 2 3
# Case 3: sapply() visits each column of the data frame directly
sapply(missing_df, function(column) sum(is.na(column)))
## a b c
## 0 2 3
For now, I will skip the rest of this chapter.
# Rows 1 to 3, all columns
worms[c(1, 2, 3), ]
## Field.Name Area Slope Vegetation Soil.pH Damp Worm.density
## 1 Nashs.Field 3.6 11 Grassland 4.1 FALSE 4
## 2 Silwood.Bottom 5.1 2 Arable 5.2 FALSE 7
## 3 Nursery.Field 2.8 3 Grassland 4.3 FALSE 2
# All rows, columns 1 to 3
worms[, c(1, 2, 3)]
## Field.Name Area Slope
## 1 Nashs.Field 3.6 11
## 2 Silwood.Bottom 5.1 2
## 3 Nursery.Field 2.8 3
## 4 Rush.Meadow 2.4 5
## 5 Gunness.Thicket 3.8 0
## 6 Oak.Mead 3.1 2
## 7 Church.Field 3.5 3
## 8 Ashurst 2.1 0
## 9 The.Orchard 1.9 0
## 10 Rookery.Slope 1.5 4
## 11 Garden.Wood 2.9 10
## 12 North.Gravel 3.3 1
## 13 South.Gravel 3.7 2
## 14 Observatory.Ridge 1.8 6
## 15 Pond.Field 4.1 0
## 16 Water.Meadow 3.9 0
## 17 Cheapside 2.2 8
## 18 Pound.Hill 4.4 2
## 19 Gravel.Pit 2.9 1
## 20 Farm.Wood 0.8 10
# Extract only the rows where Area > 3 and Slope < 3.
# The columns are read from worms itself rather than by bare name, so the filter
# does not depend on an earlier attach() or on what else is on the search path.
worms[worms$Area > 3 & worms$Slope < 3, ]
## Field.Name Area Slope Vegetation Soil.pH Damp Worm.density
## 2 Silwood.Bottom 5.1 2 Arable 5.2 FALSE 7
## 5 Gunness.Thicket 3.8 0 Scrub 4.2 FALSE 6
## 6 Oak.Mead 3.1 2 Grassland 3.9 FALSE 2
## 12 North.Gravel 3.3 1 Grassland 4.1 FALSE 1
## 13 South.Gravel 3.7 2 Grassland 4.0 FALSE 2
## 15 Pond.Field 4.1 0 Meadow 5.0 TRUE 6
## 16 Water.Meadow 3.9 0 Meadow 4.9 TRUE 8
## 18 Pound.Hill 4.4 2 Arable 4.5 FALSE 5
# Sorting: order the rows by Area from low to high and keep all columns.
# Area is read from worms explicitly instead of relying on an attached copy.
worms[order(worms$Area), ]
## Field.Name Area Slope Vegetation Soil.pH Damp Worm.density
## 20 Farm.Wood 0.8 10 Scrub 5.1 TRUE 3
## 10 Rookery.Slope 1.5 4 Grassland 5.0 TRUE 7
## 14 Observatory.Ridge 1.8 6 Grassland 3.8 FALSE 0
## 9 The.Orchard 1.9 0 Orchard 5.7 FALSE 9
## 8 Ashurst 2.1 0 Arable 4.8 FALSE 4
## 17 Cheapside 2.2 8 Scrub 4.7 TRUE 4
## 4 Rush.Meadow 2.4 5 Meadow 4.9 TRUE 5
## 3 Nursery.Field 2.8 3 Grassland 4.3 FALSE 2
## 11 Garden.Wood 2.9 10 Scrub 5.2 FALSE 8
## 19 Gravel.Pit 2.9 1 Grassland 3.5 FALSE 1
## 6 Oak.Mead 3.1 2 Grassland 3.9 FALSE 2
## 12 North.Gravel 3.3 1 Grassland 4.1 FALSE 1
## 7 Church.Field 3.5 3 Grassland 4.2 FALSE 3
## 1 Nashs.Field 3.6 11 Grassland 4.1 FALSE 4
## 13 South.Gravel 3.7 2 Grassland 4.0 FALSE 2
## 5 Gunness.Thicket 3.8 0 Scrub 4.2 FALSE 6
## 16 Water.Meadow 3.9 0 Meadow 4.9 TRUE 8
## 15 Pond.Field 4.1 0 Meadow 5.0 TRUE 6
## 18 Pound.Hill 4.4 2 Arable 4.5 FALSE 5
## 2 Silwood.Bottom 5.1 2 Arable 5.2 FALSE 7
# Order the rows by Area from high to low and keep only columns 1 and 2.
# rev() reverses the ascending order, so rows with equal Area come out in reverse too.
# Area is read from worms explicitly instead of relying on an attached copy.
worms[rev(order(worms$Area)), c(1, 2)]
## Field.Name Area
## 2 Silwood.Bottom 5.1
## 18 Pound.Hill 4.4
## 15 Pond.Field 4.1
## 16 Water.Meadow 3.9
## 5 Gunness.Thicket 3.8
## 13 South.Gravel 3.7
## 1 Nashs.Field 3.6
## 7 Church.Field 3.5
## 12 North.Gravel 3.3
## 6 Oak.Mead 3.1
## 19 Gravel.Pit 2.9
## 11 Garden.Wood 2.9
## 3 Nursery.Field 2.8
## 4 Rush.Meadow 2.4
## 17 Cheapside 2.2
## 8 Ashurst 2.1
## 9 The.Orchard 1.9
## 14 Observatory.Ridge 1.8
## 10 Rookery.Slope 1.5
## 20 Farm.Wood 0.8
# Order the rows by Slope from low to high and keep columns 1 to 3.
# Slope is read from worms explicitly instead of relying on an attached copy.
worms[order(worms$Slope), c(1, 2, 3)]
## Field.Name Area Slope
## 5 Gunness.Thicket 3.8 0
## 8 Ashurst 2.1 0
## 9 The.Orchard 1.9 0
## 15 Pond.Field 4.1 0
## 16 Water.Meadow 3.9 0
## 12 North.Gravel 3.3 1
## 19 Gravel.Pit 2.9 1
## 2 Silwood.Bottom 5.1 2
## 6 Oak.Mead 3.1 2
## 13 South.Gravel 3.7 2
## 18 Pound.Hill 4.4 2
## 3 Nursery.Field 2.8 3
## 7 Church.Field 3.5 3
## 10 Rookery.Slope 1.5 4
## 4 Rush.Meadow 2.4 5
## 14 Observatory.Ridge 1.8 6
## 17 Cheapside 2.2 8
## 11 Garden.Wood 2.9 10
## 20 Farm.Wood 0.8 10
## 1 Nashs.Field 3.6 11
# Convert Vegetation to a factor so that summary() tabulates its levels
worms[["Vegetation"]] <- as.factor(worms[["Vegetation"]])
summary(worms) # check the Vegetation column
## Field.Name Area Slope Vegetation
## Length:20 Min. :0.800 Min. : 0.00 Arable :3
## Class :character 1st Qu.:2.175 1st Qu.: 0.75 Grassland:9
## Mode :character Median :3.000 Median : 2.00 Meadow :3
## Mean :2.990 Mean : 3.50 Orchard :1
## 3rd Qu.:3.725 3rd Qu.: 5.25 Scrub :4
## Max. :5.100 Max. :11.00
## Soil.pH Damp Worm.density
## Min. :3.500 Mode :logical Min. :0.00
## 1st Qu.:4.100 FALSE:14 1st Qu.:2.00
## Median :4.600 TRUE :6 Median :4.00
## Mean :4.555 Mean :4.35
## 3rd Qu.:5.000 3rd Qu.:6.25
## Max. :5.700 Max. :9.00
# Work on a copy of the built-in mtcars data
mtcars_df <- mtcars
# Treat the transmission and cylinder codes as categories rather than numbers.
# In mtcars, am = 0 is automatic and am = 1 is manual (see ?mtcars).
mtcars_df[["am"]] <- as.factor(mtcars_df[["am"]])
mtcars_df[["cyl"]] <- as.factor(mtcars_df[["cyl"]])
summary(mtcars_df)
## mpg cyl disp hp drat
## Min. :10.40 4:11 Min. : 71.1 Min. : 52.0 Min. :2.760
## 1st Qu.:15.43 6: 7 1st Qu.:120.8 1st Qu.: 96.5 1st Qu.:3.080
## Median :19.20 8:14 Median :196.3 Median :123.0 Median :3.695
## Mean :20.09 Mean :230.7 Mean :146.7 Mean :3.597
## 3rd Qu.:22.80 3rd Qu.:326.0 3rd Qu.:180.0 3rd Qu.:3.920
## Max. :33.90 Max. :472.0 Max. :335.0 Max. :4.930
## wt qsec vs am gear
## Min. :1.513 Min. :14.50 Min. :0.0000 0:19 Min. :3.000
## 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000 1:13 1st Qu.:3.000
## Median :3.325 Median :17.71 Median :0.0000 Median :4.000
## Mean :3.217 Mean :17.85 Mean :0.4375 Mean :3.688
## 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :5.424 Max. :22.90 Max. :1.0000 Max. :5.000
## carb
## Min. :1.000
## 1st Qu.:2.000
## Median :2.000
## Mean :2.812
## 3rd Qu.:4.000
## Max. :8.000
# Summarise every column separately for each level of cyl
by(mtcars_df, INDICES = mtcars_df$cyl, FUN = summary)
## mtcars_df$cyl: 4
## mpg cyl disp hp drat
## Min. :21.40 4:11 Min. : 71.10 Min. : 52.00 Min. :3.690
## 1st Qu.:22.80 6: 0 1st Qu.: 78.85 1st Qu.: 65.50 1st Qu.:3.810
## Median :26.00 8: 0 Median :108.00 Median : 91.00 Median :4.080
## Mean :26.66 Mean :105.14 Mean : 82.64 Mean :4.071
## 3rd Qu.:30.40 3rd Qu.:120.65 3rd Qu.: 96.00 3rd Qu.:4.165
## Max. :33.90 Max. :146.70 Max. :113.00 Max. :4.930
## wt qsec vs am gear
## Min. :1.513 Min. :16.70 Min. :0.0000 0:3 Min. :3.000
## 1st Qu.:1.885 1st Qu.:18.56 1st Qu.:1.0000 1:8 1st Qu.:4.000
## Median :2.200 Median :18.90 Median :1.0000 Median :4.000
## Mean :2.286 Mean :19.14 Mean :0.9091 Mean :4.091
## 3rd Qu.:2.623 3rd Qu.:19.95 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :3.190 Max. :22.90 Max. :1.0000 Max. :5.000
## carb
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :1.545
## 3rd Qu.:2.000
## Max. :2.000
## --------------------------------------------------------
## mtcars_df$cyl: 6
## mpg cyl disp hp drat
## Min. :17.80 4:0 Min. :145.0 Min. :105.0 Min. :2.760
## 1st Qu.:18.65 6:7 1st Qu.:160.0 1st Qu.:110.0 1st Qu.:3.350
## Median :19.70 8:0 Median :167.6 Median :110.0 Median :3.900
## Mean :19.74 Mean :183.3 Mean :122.3 Mean :3.586
## 3rd Qu.:21.00 3rd Qu.:196.3 3rd Qu.:123.0 3rd Qu.:3.910
## Max. :21.40 Max. :258.0 Max. :175.0 Max. :3.920
## wt qsec vs am gear
## Min. :2.620 Min. :15.50 Min. :0.0000 0:4 Min. :3.000
## 1st Qu.:2.822 1st Qu.:16.74 1st Qu.:0.0000 1:3 1st Qu.:3.500
## Median :3.215 Median :18.30 Median :1.0000 Median :4.000
## Mean :3.117 Mean :17.98 Mean :0.5714 Mean :3.857
## 3rd Qu.:3.440 3rd Qu.:19.17 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :3.460 Max. :20.22 Max. :1.0000 Max. :5.000
## carb
## Min. :1.000
## 1st Qu.:2.500
## Median :4.000
## Mean :3.429
## 3rd Qu.:4.000
## Max. :6.000
## --------------------------------------------------------
## mtcars_df$cyl: 8
## mpg cyl disp hp drat
## Min. :10.40 4: 0 Min. :275.8 Min. :150.0 Min. :2.760
## 1st Qu.:14.40 6: 0 1st Qu.:301.8 1st Qu.:176.2 1st Qu.:3.070
## Median :15.20 8:14 Median :350.5 Median :192.5 Median :3.115
## Mean :15.10 Mean :353.1 Mean :209.2 Mean :3.229
## 3rd Qu.:16.25 3rd Qu.:390.0 3rd Qu.:241.2 3rd Qu.:3.225
## Max. :19.20 Max. :472.0 Max. :335.0 Max. :4.220
## wt qsec vs am gear
## Min. :3.170 Min. :14.50 Min. :0 0:12 Min. :3.000
## 1st Qu.:3.533 1st Qu.:16.10 1st Qu.:0 1: 2 1st Qu.:3.000
## Median :3.755 Median :17.18 Median :0 Median :3.000
## Mean :3.999 Mean :16.77 Mean :0 Mean :3.286
## 3rd Qu.:4.014 3rd Qu.:17.55 3rd Qu.:0 3rd Qu.:3.000
## Max. :5.424 Max. :18.00 Max. :0 Max. :5.000
## carb
## Min. :2.00
## 1st Qu.:2.25
## Median :3.50
## Mean :3.50
## 3rd Qu.:4.00
## Max. :8.00
Let’s take a look below.
# Make sure the grouping column is a factor (as.factor() returns a factor unchanged)
worms[["Vegetation"]] <- as.factor(worms[["Vegetation"]])
# Mean of column 7 (Worm.density) within each vegetation type
tapply(worms[[7]], INDEX = worms[["Vegetation"]], FUN = mean)
## Arable Grassland Meadow Orchard Scrub
## 5.333333 2.444444 6.333333 9.000000 5.250000
# The same computation, with the columns looked up by name inside worms
with(worms, tapply(Worm.density, INDEX = Vegetation, FUN = mean))
## Arable Grassland Meadow Orchard Scrub
## 5.333333 2.444444 6.333333 9.000000 5.250000
# Group means by vegetation type, one numeric column at a time
# Column 2: Area
tapply(worms[[2]], INDEX = worms$Vegetation, FUN = mean)
## Arable Grassland Meadow Orchard Scrub
## 3.866667 2.911111 3.466667 1.900000 2.425000
# Column 3: Slope
tapply(worms[[3]], INDEX = worms$Vegetation, FUN = mean)
## Arable Grassland Meadow Orchard Scrub
## 1.333333 3.666667 1.666667 0.000000 7.000000
# Column 5: Soil.pH
tapply(worms[[5]], INDEX = worms$Vegetation, FUN = mean)
## Arable Grassland Meadow Orchard Scrub
## 4.833333 4.100000 4.933333 5.700000 4.800000
# Column 7: Worm.density
tapply(worms[[7]], INDEX = worms$Vegetation, FUN = mean)
## Arable Grassland Meadow Orchard Scrub
## 5.333333 2.444444 6.333333 9.000000 5.250000
# All four group means in one call: by() hands each vegetation subset to the
# function, which returns the column means of columns 2, 3, 5 and 7
# (Area, Slope, Soil.pH and Worm.density)
by(worms, worms$Vegetation, function(group) colMeans(group[, c(2, 3, 5, 7)]))
## worms$Vegetation: Arable
## Area Slope Soil.pH Worm.density
## 3.866667 1.333333 4.833333 5.333333
## --------------------------------------------------------
## worms$Vegetation: Grassland
## Area Slope Soil.pH Worm.density
## 2.911111 3.666667 4.100000 2.444444
## --------------------------------------------------------
## worms$Vegetation: Meadow
## Area Slope Soil.pH Worm.density
## 3.466667 1.666667 4.933333 6.333333
## --------------------------------------------------------
## worms$Vegetation: Orchard
## Area Slope Soil.pH Worm.density
## 1.9 0.0 5.7 9.0
## --------------------------------------------------------
## worms$Vegetation: Scrub
## Area Slope Soil.pH Worm.density
## 2.425 7.000 4.800 5.250
# Plot Soil.pH against Area separately for each vegetation type
worms_df <- worms
worms_df$Vegetation <- as.factor(worms_df$Vegetation)
# One row of five panels, one panel per vegetation level
par(mfrow = c(1, 5))
levels(worms_df$Vegetation)
## [1] "Arable" "Grassland" "Meadow" "Orchard" "Scrub"
by(worms_df, worms_df$Vegetation, function(x) {
  # Draw the scatter plot for this vegetation subset. All panels share the same
  # axis limits, and the title names the vegetation type so the panels can be
  # told apart.
  plot(x$Area, x$Soil.pH,
    main = as.character(x$Vegetation[1]),
    xlab = "Area", ylab = "Soil.PH", xlim = c(1, 10), ylim = c(1, 10)
  )
})
## worms_df$Vegetation: Arable
## NULL
## --------------------------------------------------------
## worms_df$Vegetation: Grassland
## NULL
## --------------------------------------------------------
## worms_df$Vegetation: Meadow
## NULL
## --------------------------------------------------------
## worms_df$Vegetation: Orchard
## NULL
## --------------------------------------------------------
## worms_df$Vegetation: Scrub
## NULL
# Load the built-in BOD data set (columns Time and demand)
data("BOD")
# Row sums: apply() with MARGIN = 1 returns a numeric vector, not a list
class(apply(BOD, MARGIN = 1, FUN = sum))
## [1] "numeric"
# Column sums: MARGIN = 2
apply(BOD, MARGIN = 2, FUN = sum)
## Time demand
## 22 89
# Multiply all values by 10; with MARGIN = 1 each row comes back as a column of the result
apply(BOD, MARGIN = 1, FUN = function(row) row * 10)
## [,1] [,2] [,3] [,4] [,5] [,6]
## Time 10 20 30 40 50 70
## demand 83 103 190 160 156 198
# lapply() returns a list with one element per column
lapply(BOD, FUN = sum)
## $Time
## [1] 22
##
## $demand
## [1] 89
class(lapply(BOD, FUN = sum))
## [1] "list"
# sapply() simplifies the same result to a named numeric vector
sapply(BOD, FUN = sum)
## Time demand
## 22 89
class(sapply(BOD, FUN = sum))
## [1] "numeric"
# tapply() groups Time by demand. Every demand value is distinct here, so each group
# holds a single row: without a categorical grouping variable tapply() is of little use.
tapply(BOD$Time, INDEX = BOD$demand, FUN = mean)
## 8.3 10.3 15.6 16 19 19.8
## 1 2 5 4 3 7
# Simulated trial data: 100 patients with normally distributed ages (mean 60, sd 12);
# the first 50 are labelled "Treatment" and the last 50 "Control".
# No seed is set in the visible code, so the ages differ from run to run.
medical.example <- data.frame(
  patient = seq_len(100),
  age = rnorm(n = 100, mean = 60, sd = 12),
  treatment = factor(
    rep(c("Treatment", "Control"), each = 50),
    levels = c("Treatment", "Control")
  )
)
summary(medical.example)
## patient age treatment
## Min. : 1.00 Min. :23.93 Treatment:50
## 1st Qu.: 25.75 1st Qu.:52.84 Control :50
## Median : 50.50 Median :58.80
## Mean : 50.50 Mean :59.97
## 3rd Qu.: 75.25 3rd Qu.:68.50
## Max. :100.00 Max. :87.07
# attach() puts the columns of medical.example on the search path
attach(medical.example)
# Group function: mean age per treatment arm. The columns are read from
# medical.example explicitly, so a same-named object elsewhere on the search path
# cannot be picked up by mistake.
tapply(medical.example$age, medical.example$treatment, mean)
## Treatment Control
## 63.18833 56.75904
The tapply function is useful when we need to break up a vector into groups defined by some classifying factor, compute a function on the subsets, and return the results in a convenient form.
example (5) aggregate
# Mean of columns 2, 3, 5 and 7 for each vegetation type; the grouping column is labelled "Community"
aggregate(
  worms[, c(2, 3, 5, 7)],
  by = list(Community = worms$Vegetation),
  FUN = mean
)
## Community Area Slope Soil.pH Worm.density
## 1 Arable 3.866667 1.333333 4.833333 5.333333
## 2 Grassland 2.911111 3.666667 4.100000 2.444444
## 3 Meadow 3.466667 1.666667 4.933333 6.333333
## 4 Orchard 1.900000 0.000000 5.700000 9.000000
## 5 Scrub 2.425000 7.000000 4.800000 5.250000
These two articles may be the best explanations of what the grouping functions are.
das <- read.csv("Data Files/das.csv", stringsAsFactors = FALSE)
# attach() puts the columns of das on the search path; the message below reports
# the name clash with the y that was attached earlier from das_df
attach(das)
## The following object is masked from das_df:
##
## y
# Plot y from das explicitly: a bare y is ambiguous once several attached data
# frames each provide a column of that name. Inside with() the expression is
# still `y`, so the axis label is unchanged.
with(das, plot(y))
scatter_df <- read.csv("Data Files/scatter.csv")
attach(scatter_df)
## The following object is masked from das:
##
## y
## The following object is masked from das_df:
##
## y
# Show the first six rows (the default number for head())
head(scatter_df, n = 6L)
## x y
## 1 0.000000 0.00000
## 2 5.112000 61.04000
## 3 1.320000 11.11130
## 4 35.240000 140.65000
## 5 1.632931 26.15218
## 6 2.297635 10.00100
# Scatter plot of y against x from scatter_df, drawn as red-filled circles -- what do you see?
# x and y are looked up inside scatter_df rather than on the search path, where y
# is also provided by other attached data frames.
with(scatter_df, plot(x, y, pch = 21, bg = "red"))
weather_df <- read.csv("Data Files/weather.data.csv")
attach(weather_df)
# plot() of a factor against a numeric vector draws box plots: here one box of
# upper per month
with(weather_df, plot(factor(month), upper))
# nitrogen and phosphorus are needed as factors: the str() output below shows them
# as factors and the plot() calls that follow rely on that. Since R 4.0.0
# read.csv() no longer converts strings to factors by default, so the conversion
# is requested explicitly.
np_df <- read.csv("Data Files/np.csv", stringsAsFactors = TRUE)
str(np_df)
## 'data.frame': 40 obs. of 3 variables:
## $ yield : num 0.827 3.613 2.619 1.741 0.659 ...
## $ nitrogen : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ phosphorus: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# attach() puts the columns of np_df on the search path
attach(np_df)
# Two panels side by side
par(mfrow = c(1, 2))
# The columns are looked up inside np_df via with(), so the plots and the table
# do not depend on which other attached data frames are on the search path
with(np_df, plot(nitrogen, yield, main = "Nitrogen"))
with(np_df, plot(phosphorus, yield, main = "Phosphorus"))
# Mean yield for every nitrogen (rows) by phosphorus (columns) combination
with(np_df, tapply(yield, list(nitrogen, phosphorus), mean))
## no yes
## no 1.47384 1.875928
## yes 2.28999 3.480184
# Grouped bar chart of the mean yield per nitrogen/phosphorus combination.
# The columns are looked up inside np_df via with() instead of relying on attach().
with(np_df, barplot(
  tapply(yield, list(nitrogen, phosphorus), mean),
  beside = TRUE,
  xlab = "phosphorus",
  ylab = "yield"
))