exploratory_graphs2.R

IanMac — Feb 5, 2013, 4:42 PM

#Dataframe (iQ) with 350 group 1 members and 650 group 2 members with the following characteristics (variables):
#iQ scores (iQ): normally distributed
#Ages (ages): random 5 - 95
#Weights (kg): random 50 - 150 

# Build the example data frame `iQ`: 350 members of group 1 followed by 650
# members of group 2, each with an iQ score, a group label, an age and a weight.
# Each column is created with its final type from the start, so the scores never
# pass through a numeric -> character -> numeric round trip (which a cbind() of
# numbers and strings would force, and which can discard precision).

# Group 1: iQ scores with the lower mean and dispersion.
set.seed(1)
iq1scores <- rnorm(350, 95, 20)
iq1 <- data.frame(iQ = iq1scores, group = "1", stringsAsFactors = FALSE)

# Group 2: iQ scores with the higher mean and dispersion.
set.seed(2)
iq2scores <- rnorm(650, 105, 25)
iq2 <- data.frame(iQ = iq2scores, group = "2", stringsAsFactors = FALSE)

# Stack both groups; `group` stays a character column ("1" / "2").
iQ <- rbind(iq1, iq2)
iQ_avg <- mean(iQ$iQ) # average iQ over both groups

# Ages: random integers between 5 and 95, one per individual.
set.seed(3)
ages <- sample(5:95, nrow(iQ), replace = TRUE)
iQ$ages <- ages
ages_avg <- mean(iQ$ages) # average age

# Weights: random numbers between 50 and 150 (kg), one per individual.
set.seed(4)
kg <- runif(nrow(iQ), 50, 150)
iQ$kg <- kg
kg_avg <- mean(iQ$kg) # average weight

##Dataframe now created, ready for plotting.
# Preview the first rows of the data frame.
head(iQ)
      iQ group ages     kg
1  82.47     1   20 108.58
2  98.67     1   78  50.89
3  78.29     1   40  79.37
4 126.91     1   34  77.74
5 101.59     1   59 131.36
6  78.59     1   59  76.04
# Preview the last rows of the data frame.
tail(iQ)
         iQ group ages     kg
995   92.95     2   16  93.14
996  124.32     2   23 138.00
997   75.27     2   20  82.30
998  114.30     2   52  58.29
999   81.40     2   48  78.04
1000  89.87     2   53  87.54

# Scatterplots
# Age against iQ, drawn as large solid blue dots.
plot(x = iQ$ages, y = iQ$iQ, col = "blue", pch = 19, cex = 2)

plot of chunk unnamed-chunk-1

# Same scatterplot with smaller dots, which shows more detail.
plot(x = iQ$ages, y = iQ$iQ, col = "blue", pch = 19, cex = 0.5)

plot of chunk unnamed-chunk-1

# Colour each dot by group. The group labels are the strings "1" and "2",
# so convert them to integer palette indices explicitly.
plot(x = iQ$ages, y = iQ$iQ, col = as.integer(iQ$group), pch = 19, cex = 0.5)

plot of chunk unnamed-chunk-1

library(Hmisc)
Loading required package: survival
Loading required package: splines
Hmisc library by Frank E Harrell Jr

Type library(help='Hmisc'), ?Overview, or ?Hmisc.Overview') to see overall
documentation.

NOTE:Hmisc no longer redefines [.factor to drop unused levels when
subsetting.  To get the old behavior of Hmisc type dropUnusedLevels().
Attaching package: 'Hmisc'
The following object(s) are masked from 'package:survival':

untangle.specials
The following object(s) are masked from 'package:base':

format.pval, round.POSIXt, trunc.POSIXt, units
# Split the weights into three quantile groups and colour the dots by group.
weight_group <- cut2(iQ$kg, g = 3)
plot(x = iQ$ages, y = iQ$iQ, col = weight_group, pch = 19, cex = 1)

plot of chunk unnamed-chunk-1

# Colour dots by group and size each dot by its weight relative to the
# average weight (a dot of average weight gets cex = 1).
plot(iQ$ages, iQ$iQ, pch = 19, col = iQ$group, cex = iQ$kg / kg_avg)
# Add a line at the average iQ level, spanning the observed age range.
# Two end points are enough; no need for one point per (unsorted) observation.
lines(range(iQ$ages), rep(iQ_avg, 2), col = "green", lwd = 10)
# Extra points for illustration: a descending diagonal of open blue circles.
points(seq(5, 95, length.out = 90), seq(180, 40, length.out = 90),
       col = "blue", pch = 1, cex = 1)

plot of chunk unnamed-chunk-1


# Alternatives for seeing detail when there are a lot of points
# Baseline for comparison: an ordinary scatterplot with large solid dots.
plot(x = iQ$ages, y = iQ$iQ, pch = 19, cex = 2)

plot of chunk unnamed-chunk-1

# Smoothed scatterplot of the same age and iQ data.
smoothScatter(x = iQ$ages, y = iQ$iQ, pch = 19, cex = 2)
KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009

plot of chunk unnamed-chunk-1

library(hexbin) #load hexbin package
Loading required package: grid
Loading required package: lattice
# Bin the points into hexagons (12 bins along x), then plot the bins;
# the shade of each hexagon varies with the count of points inside it.
age_iq_bins <- hexbin(x = iQ$ages, y = iQ$iQ, xbins = 12,
                      xlab = 'Age', ylab = 'iQ')
plot(age_iq_bins)

plot of chunk unnamed-chunk-1


# QQ plot
# Plot the quantiles of age against the corresponding quantiles of iQ.
qqplot(x = iQ$ages, y = iQ$iQ, col = 'orange')

plot of chunk unnamed-chunk-1


# Visually checking for NAs (missing data)
# Seed the generator first, as is done before every other random draw in this
# script, so this example gives the same picture when run on its own.
set.seed(5)
x <- rnorm(100)
y <- rnorm(100)
y[x < 0] <- NA # make y value = NA when x is negative
# Compare the distribution of x where y is present (blue) and where y is
# missing (red).
boxplot(x ~ is.na(y), xlab = 'y value is NA?', col = c('blue', 'red'))

plot of chunk unnamed-chunk-1