This report was automatically generated with the R package knitr
(version 0.8).
# Install knitr only if it is not already available. An unconditional
# install.packages() call contacts the repository on every run and, as the
# recorded output below shows, is refused with a warning when the package is
# already in use.
if (!requireNamespace("knitr", quietly = TRUE)) {
  install.packages("knitr")
}
## Output recorded from the original, unconditional install.packages("knitr"):
## Installing package(s) into 'C:/Users/Tom/Documents/R/win-library/2.14' (as
## 'lib' is unspecified)
## Warning in install.packages :
## package 'knitr' is in use and will not be installed
library("knitr")
# Work from the exercise folder so the data files can be referred to by bare
# file name.
# NOTE(review): this absolute path is specific to one machine; the later
# read.csv() calls depend on it.
setwd("F:/Documents/Work/Courses/Stats_LUND_2012_October/Exercises/Ex_01")

# Read the tit data set, show its column names, then print the whole table.
tits <- read.csv(file = "tits.csv")
colnames(tits)
## [1] "spe" "wei" "egg"
print(tits)
## spe wei egg
## 1 TX 17.3 10
## 2 TX 17.2 6
## 3 TX 14.5 9
## 4 TX 17.8 4
## 5 TX 18.0 7
## 6 TX 18.3 8
## 7 BM 9.3 10
## 8 TX 15.1 9
## 9 TX NA 10
## 10 TX 15.8 10
## 11 TX 16.9 7
## 12 TX 16.2 9
## 13 BM 11.0 11
## 14 TX 16.8 7
## 15 TX 16.3 11
## 16 TX NA 7
## 17 BM 11.8 13
## 18 TX 16.5 11
## 19 TX NA 6
## 20 BM 11.9 12
## 21 TX 17.8 9
## 22 BM 10.6 10
## 23 TX 17.6 6
## 24 TX 18.8 10
## 25 TX 19.2 11
## 26 TX 16.5 9
## 27 BM 11.9 11
## 28 TX NA 9
## 29 BM 11.9 10
## 30 TX 19.5 8
## 31 BM 10.8 13
## 32 TX 18.1 8
## 33 TX 18.6 9
## 34 TX 18.8 9
## 35 TX NA 8
## 36 TX NA 7
## 37 TX 19.2 9
## 38 TX NA 8
## 39 TX NA 8
## 40 TX NA 5
## 41 BM 11.2 12
## 42 TX 18.8 7
## 43 BM 12.3 13
## 44 TX 19.3 11
## 45 TX NA 10
## 46 TX 18.7 7
## 47 BM 11.3 12
## 48 TX NA 9
## 49 TX 18.7 10
## 50 TX 17.5 7
## 51 TX NA 6
## 52 TX NA 6
## 53 TX NA 9
## 54 TX NA 8
## 55 TX 16.9 8
## 56 TX 18.0 11
## 57 TX 18.3 10
## 58 TX 17.6 10
## 59 TX NA 8
## 60 BM 8.3 8
## 61 TX NA 9
## 62 TX 17.5 8
## 63 TX NA 11
## 64 TX 17.5 11
## 65 TX 19.5 8
## 66 BM 11.1 13
## 67 TX 18.8 11
## 68 TX 15.9 7
## 69 TX NA 9
## 70 TX 17.0 7
## 71 TX 16.6 9
## 72 TX 16.8 9
## 73 BM 11.7 9
## 74 BM 11.9 13
## 75 BM 12.0 11
## 76 TX 18.6 12
## 77 TX NA 7
## 78 TX 18.3 9
## 79 BM NA 7
## 80 TX NA 6
## 81 BM 11.7 11
## 82 TX 15.8 11
## 83 TX 13.2 10
## 84 TX NA 6
## 85 BM NA 10
## 86 TX NA 9
## 87 BM NA 10
## 88 BM 10.2 12
## 89 TX 13.4 9
## 90 TX 17.3 11
# Declare spe as a factor and store the result back in the data frame: a bare
# factor(tits$spe) only prints the converted values and leaves the column
# unchanged. Older read.csv() defaults did this conversion automatically, but
# with stringsAsFactors = FALSE (the default since R 4.0) spe is read as
# character, so the explicit assignment is needed.
tits$spe <- factor(tits$spe)
tits$spe # print the factor: its values followed by its levels
## [1] TX TX TX TX TX TX BM TX TX TX TX TX BM TX TX TX BM TX TX BM TX BM TX
## [24] TX TX TX BM TX BM TX BM TX TX TX TX TX TX TX TX TX BM TX BM TX TX TX
## [47] BM TX TX TX TX TX TX TX TX TX TX TX TX BM TX TX TX TX TX BM TX TX TX
## [70] TX TX TX BM BM BM TX TX TX BM TX BM TX TX TX BM TX BM BM TX TX
## Levels: BM TX
# 5. Create a new variable holding the full common name of each species. With
# only two species codes a single ifelse() call is enough.
tits[["names"]] <- ifelse(tits[["spe"]] == "TX", "Great tit", "Blue tit")
# 6. Alternatively, when there are more than two possibilities, fill in the
# names one species at a time using a logical index.
tits[["names"]][tits[["spe"]] == "TX"] <- "Great tit"
tits[["names"]][tits[["spe"]] == "BM"] <- "Blue tit"
# 7. Some summary statistics. names is made a factor first, so that summary()
# reports a count per name.
tits[["names"]] <- factor(tits[["names"]])
summary(object = tits)
## spe wei egg names
## BM:21 Min. : 8.3 Min. : 4.00 Blue tit :21
## TX:69 1st Qu.:12.0 1st Qu.: 8.00 Great tit:69
## Median :16.9 Median : 9.00
## Mean :15.7 Mean : 9.12
## 3rd Qu.:18.1 3rd Qu.:11.00
## Max. :19.5 Max. :13.00
## NA's :26.0
mean(tits$egg)
## [1] 9.122
# Use sapply to apply the mean function to each numeric column of the data
# frame. The factor columns (spe, names) are left out: mean() is not defined
# for them, and including them only produced the warning "argument is not
# numeric or logical: returning NA" plus an NA entry for each.
# na.rm = TRUE is needed because wei contains missing values.
sapply(Filter(is.numeric, tits), mean, na.rm = TRUE)
## wei egg
## 15.652 9.122
# 8. To get values by species, use the 'describeBy' function, available in
# package 'psych'. The package is installed only when it is missing, instead
# of being downloaded again on every run.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
## Output recorded from the original, unconditional install.packages("psych"):
## Installing package(s) into 'C:/Users/Tom/Documents/R/win-library/2.14' (as
## 'lib' is unspecified)
## package 'psych' successfully unpacked and MD5 sums checked
##
## The downloaded packages are in
## C:\Users\Tom\AppData\Local\Temp\RtmpCAEA9u\downloaded_packages
library("psych")
## Warning: package 'psych' was built under R version 2.14.2
# The original call used the old name describe.by(), which the recorded run
# could not find (Error: could not find function "describe.by"). describeBy()
# is the name psych exports for the same grouped summary.
describeBy(tits, tits$names) # each variable is summarised, subset by species
# 9. t-tests to compare the species, using the formula form t.test(y ~ x):
# the measured variable (wei) goes on the left and the two-level grouping
# factor (names) on the right. With default arguments R runs the Welch
# two-sample test, which does not assume equal variances in the two groups.
t.test(tits$wei ~ tits$names)
##
## Welch Two Sample t-test
##
## data: tits$wei by tits$names
## t = -19.1, df = 44.22, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.907 -5.588
## sample estimates:
## mean in group Blue tit mean in group Great tit
## 11.16 17.41
# Same two-sample comparison between the species, now for the egg column.
t.test(tits$egg ~ tits$names)
##
## Welch Two Sample t-test
##
## data: tits$egg by tits$names
## t = 5.751, df = 33.53, p-value = 1.901e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.583 3.315
## sample estimates:
## mean in group Blue tit mean in group Great tit
## 11.000 8.551
# Both wei and egg differ significantly between the two species (p < 0.001 in
# each test).
# 10. Scatterplot of egg by weight, colouring the data points by species.
# The y variable is spelled out as tits$egg: the original tits$eg only worked
# through partial matching of `$`, which silently returns NULL as soon as a
# second column starting with "eg" exists. as.integer(factor(...)) yields the
# colour codes 1 and 2 whether spe is stored as a factor or as character.
plot(tits$wei, tits$egg, col = as.integer(factor(tits$spe)))
# 11. Split the data into one data frame per species code.
great <- subset(tits, subset = spe == "TX")
blue <- subset(tits, subset = spe == "BM")
# 12. Now build up the plot above in stages: draw one species first, then add
# the data from the other afterwards. Both axis ranges are taken from the full
# data set so that the points added later fit inside the plotting region.
plot(great$egg ~ great$wei,
  xlab = "weight", ylab = "eggs",
  xlim = range(tits$wei, na.rm = TRUE), # TRUE rather than T: T can be reassigned
  ylim = range(tits$egg, na.rm = TRUE)
)
points(blue$egg ~ blue$wei, col = "red")
# Make a regression object (egg on wei, both species together), used to apply
# a best-fit line. Written with data = tits to match lm.2 and lm.3 below.
lm.1 <- lm(egg ~ wei, data = tits)
abline(lm.1, lty = 2, lwd = 2) # add the above regression line to the plot
# 13. The appearance of the line can be changed with lwd (width), lty (line
# type), col (colour) etc.
abline(lm.1, lwd = 4)
abline(lm.1, lwd = 2, col = "red")
abline(lm.1, lty = 2)
# 14. Read in the second data set and take a first look at it: column names,
# then a per-column summary.
xydata <- read.csv(file = "xydata.csv")
colnames(xydata)
## [1] "x" "y"
summary(object = xydata)
## x y
## Min. : 3.61 Min. :3.32
## 1st Qu.:21.25 1st Qu.:4.55
## Median :29.06 Median :4.78
## Mean :27.91 Mean :4.69
## 3rd Qu.:34.09 3rd Qu.:4.92
## Max. :48.32 Max. :5.21
# Plot y against x and overlay the fitted straight line.
plot(y ~ x, data = xydata)
lm.2 <- lm(formula = y ~ x, data = xydata)
abline(lm.2, col = "blue", lwd = 2, lty = 2)
# There is evidence that the relationship is not directly linear: the points
# at low values of x fall below the regression line, and so do the points at
# high values of x, suggesting a curvilinear relationship. We therefore take
# the natural logarithm of x and plot again.
xydata[["ln.x"]] <- log(xydata[["x"]])
plot(y ~ ln.x, data = xydata)
lm.3 <- lm(formula = y ~ ln.x, data = xydata)
abline(lm.3, col = "blue", lty = 2, lwd = 2)
# This second model is much improved, with the points falling much more
# closely along the line.
The R session information (including the OS info, R version and all
packages used):
# Print the R version, platform, locale and the attached/loaded packages of
# the session that produced this report.
sessionInfo()
## R version 2.14.1 (2011-12-22)
## Platform: x86_64-pc-mingw32/x64 (64-bit)
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] psych_1.2.8 knitr_0.8
##
## loaded via a namespace (and not attached):
## [1] digest_0.5.2 evaluate_0.4.2 formatR_0.6 plyr_1.7.1
## [5] stringr_0.6.1 tools_2.14.1
# Print the current date and time at the moment this line is run.
Sys.time()
## [1] "2012-10-22 15:00:07 CEST"