# By Luis George
# This script shows how to manipulate data in R:
# - how to rename the columns (variables) of a data frame
# - how to plot with the lattice package
# The first data set gives life expectancy by country and region.
# Load the life-expectancy data and give its columns meaningful names.
library(plyr)

setwd("C:\\Users\\Luis\\Documents\\Making sense of data")

# `header` is left at its default (FALSE), so the columns come back as V1..V3.
lifesp <- read.table(
  "C:\\Users\\Luis\\Documents\\Making sense of data\\LifeExpRegion.txt",
  sep = ""
)
names(lifesp)
## [1] "V1" "V2" "V3"

# NOTE: an attach(lifesp) call used to sit here. It ran before the rename, so
# it only placed stale copies named V1..V3 on the search path; every later
# call reaches the columns through `lifesp$...` or a `data =` argument, so the
# attach was dropped.

# Rename the default column names (plyr::rename takes c(old = new)).
lifesp <- rename(
  lifesp,
  c("V1" = "country", "V2" = "lifespx", "V3" = "region")
)
names(lifesp)
## [1] "country" "lifespx" "region"

# First five rows of the renamed data frame.
lifesp[1:5, ]
## country lifespx region
## 1 Afghanistan 48.673 SAs
## 2 Albania 76.918 EuCA
## 3 Algeria 73.131 MENA
## 4 Angola 51.093 SSA
## 5 Argentina 75.901 Amer
str(lifesp)
## 'data.frame': 197 obs. of 3 variables:
## $ country: Factor w/ 197 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ lifespx: num 48.7 76.9 73.1 51.1 75.9 ...
## $ region : Factor w/ 6 levels "Amer","EAP","EuCA",..: 5 3 4 6 1 3 1 2 3 3 ...
# Lattice graphics: four views of life expectancy broken down by region.
library(lattice)

# One histogram panel per region.
histogram(
  ~ lifesp$lifespx | factor(region),
  data = lifesp
)

# Overlaid density curves, one per region, with the key on the right and the
# individual data points suppressed.
densityplot(
  ~ lifesp$lifespx,
  groups = region,
  data = lifesp,
  auto.key = list(title = "region", space = "right"),
  plot.points = FALSE
)

# Box-and-whisker plot per region (varwidth = TRUE is passed through to the
# panel function).
bwplot(
  lifesp$lifespx ~ lifesp$region,
  xlab = "Region",
  ylab = "Life Expectancy",
  varwidth = TRUE
)

# Jittered, semi-transparent strip plot of the individual countries.
stripplot(
  lifespx ~ factor(region),
  data = lifesp,
  alpha = 0.6,
  jitter.data = TRUE,
  xlab = "Region",
  ylab = "Expectativa de Vida"
)

# Summary of every column of the life-expectancy data.
summary(lifesp)
## country lifespx region
## Afghanistan: 1 Min. :47.79 Amer:39
## Albania : 1 1st Qu.:64.67 EAP :30
## Algeria : 1 Median :73.23 EuCA:50
## Angola : 1 Mean :69.86 MENA:21
## Argentina : 1 3rd Qu.:76.65 SAs : 8
## Armenia : 1 Max. :83.39 SSA :49
## (Other) :191

# Number of countries in each region.
t1 <- table(lifesp$region)
t1
##
## Amer EAP EuCA MENA SAs SSA
## 39 30 50 21 8 49

# Life-expectancy summary for the "Amer" region only.
# "Amer" is a level of `region`, not a column, so the previous call
# summary(lifesp$Amer) summarised NULL. Subset `lifespx` by region instead.
summary(lifesp$lifespx[lifesp$region == "Amer"])
## (no output recorded for this call; the previous summary(lifesp$Amer)
## printed: Length 0, Class NULL, Mode NULL)

str(lifesp)
## 'data.frame': 197 obs. of 3 variables:
## $ country: Factor w/ 197 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ lifespx: num 48.7 76.9 73.1 51.1 75.9 ...
## $ region : Factor w/ 6 levels "Amer","EAP","EuCA",..: 5 3 4 6 1 3 1 2 3 3 ...

# Absolute and relative frequency of each region, side by side.
# prop.table() divides by the table total instead of a hard-coded 197, so the
# proportions stay correct if the number of rows changes. Note that table()
# drops NA values, so the proportions are relative to the non-missing regions.
freqRegion <- table(lifesp$region)
relFreqRegion <- prop.table(freqRegion)
cbind(freqRegion, relFreqRegion)
## freqRegion relFreqRegion
## Amer 39 0.19796954
## EAP 30 0.15228426
## EuCA 50 0.25380711
## MENA 21 0.10659898
## SAs 8 0.04060914
## SSA 49 0.24873096
##C:\Users\Luis\Documents\Making sense of data(video)
# Read the skeleton data set; the first line of the file supplies the column
# names (header = TRUE).
skeleto <- read.table(
  file = "C:\\Users\\Luis\\Documents\\Making sense of data\\SkeletonData2.txt",
  header = TRUE
)

# Per-column summary of the freshly loaded data.
summary(skeleto)
## Observation Sex BMIcat BMIquant
## Min. : 1.0 Min. :1.000 normal :225 Min. :10.06
## 1st Qu.:100.8 1st Qu.:1.000 obese : 20 1st Qu.:19.41
## Median :200.5 Median :1.000 overweight : 81 Median :22.48
## Mean :200.5 Mean :1.298 underweight: 74 Mean :22.50
## 3rd Qu.:300.2 3rd Qu.:2.000 3rd Qu.:25.11
## Max. :400.0 Max. :2.000 Max. :40.03
## Age DGEstimate DGDifference
## Min. :19.00 Min. :12.0 Min. :-60.00
## 1st Qu.:40.00 1st Qu.:32.0 1st Qu.:-23.00
## Median :51.00 Median :32.0 Median :-13.00
## Mean :52.35 Mean :38.2 Mean :-14.15
## 3rd Qu.:63.25 3rd Qu.:44.0 3rd Qu.: -5.00
## Max. :85.00 Max. :66.0 Max. : 32.00
# Inspect the structure of the skeleton data before recoding.
str(skeleto)
## 'data.frame': 400 obs. of 7 variables:
## $ Observation : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Sex : int 2 1 1 1 1 1 1 1 1 1 ...
## $ BMIcat : Factor w/ 4 levels "normal","obese",..: 4 1 3 3 1 4 3 4 1 1 ...
## $ BMIquant : num 15.7 23 27.9 27.8 21.4 ...
## $ Age : int 78 44 72 59 60 34 50 73 70 60 ...
## $ DGEstimate : int 44 32 32 44 32 25 32 50 39 44 ...
## $ DGDifference: int -34 -12 -40 -15 -28 -9 -18 -23 -31 -16 ...
summary(skeleto$BMIcat)
## normal obese overweight underweight
## 225 20 81 74
str(skeleto$BMIcat)
## Factor w/ 4 levels "normal","obese",..: 4 1 3 3 1 4 3 4 1 1 ...

# Recode Sex from the integer codes 1/2 to a labelled factor. A single
# factor() call is enough; the separate as.factor() step was redundant.
skeleto$Sex <- factor(
  skeleto$Sex,
  levels = c("1", "2"),
  labels = c("Male", "Female")
)

# Stand-alone copy of BMIcat with its levels in increasing-BMI order.
# The column already stores the text labels, so the levels must be those
# labels. The previous levels "1".."4" matched nothing and turned every value
# into NA. skeleto$BMIcat itself is deliberately left unchanged.
BMIcat <- factor(
  skeleto$BMIcat,
  levels = c("underweight", "normal", "overweight", "obese")
)

# Structure after recoding: Sex is now a factor.
str(skeleto)
## 'data.frame': 400 obs. of 7 variables:
## $ Observation : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 2 1 1 1 1 1 1 1 1 1 ...
## $ BMIcat : Factor w/ 4 levels "normal","obese",..: 4 1 3 3 1 4 3 4 1 1 ...
## $ BMIquant : num 15.7 23 27.9 27.8 21.4 ...
## $ Age : int 78 44 72 59 60 34 50 73 70 60 ...
## $ DGEstimate : int 44 32 32 44 32 25 32 50 39 44 ...
## $ DGDifference: int -34 -12 -40 -15 -28 -9 -18 -23 -31 -16 ...
# First five rows of the skeleton data.
skeleto[1:5, ]
## Observation Sex BMIcat BMIquant Age DGEstimate DGDifference
## 1 1 Female underweight 15.66 78 44 -34
## 2 2 Male normal 23.03 44 32 -12
## 3 3 Male overweight 27.92 72 32 -40
## 4 4 Male overweight 27.83 59 44 -15
## 5 5 Male normal 21.41 60 32 -28
summary(skeleto$Sex)
## Male Female
## 281 119

# Absolute and relative frequency of each BMI category.
# prop.table() divides by the table total rather than a hard-coded 400, so the
# proportions remain correct if the number of observations changes (table()
# drops NA values, so proportions are relative to the non-missing entries).
freqBMIcat <- table(skeleto$BMIcat)
relfrecBMIcat <- prop.table(freqBMIcat)
cbind(freqBMIcat, relfrecBMIcat)
## freqBMIcat relfrecBMIcat
## normal 225 0.5625
## obese 20 0.0500
## overweight 81 0.2025
## underweight 74 0.1850

# Absolute and relative frequency of each sex, computed the same way.
freqSex <- table(skeleto$Sex)
relfreqSex <- prop.table(freqSex)
cbind(freqSex, relfreqSex)
## freqSex relfreqSex
## Male 281 0.7025
## Female 119 0.2975
# Two-way table of BMI category (rows) against sex (columns). CrossTable()
# prints the formatted table and its return value is kept in `join`.
library(gmodels)
join <- CrossTable(
  skeleto$BMIcat,
  skeleto$Sex,
  chisq = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 400
##
##
## | skeleto$Sex
## skeleto$BMIcat | Male | Female | Row Total |
## ---------------|-----------|-----------|-----------|
## normal | 166 | 59 | 225 |
## | 0.399 | 0.941 | |
## | 0.738 | 0.262 | 0.562 |
## | 0.591 | 0.496 | |
## | 0.415 | 0.147 | |
## ---------------|-----------|-----------|-----------|
## obese | 10 | 10 | 20 |
## | 1.167 | 2.757 | |
## | 0.500 | 0.500 | 0.050 |
## | 0.036 | 0.084 | |
## | 0.025 | 0.025 | |
## ---------------|-----------|-----------|-----------|
## overweight | 59 | 22 | 81 |
## | 0.077 | 0.183 | |
## | 0.728 | 0.272 | 0.203 |
## | 0.210 | 0.185 | |
## | 0.147 | 0.055 | |
## ---------------|-----------|-----------|-----------|
## underweight | 46 | 28 | 74 |
## | 0.689 | 1.627 | |
## | 0.622 | 0.378 | 0.185 |
## | 0.164 | 0.235 | |
## | 0.115 | 0.070 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 281 | 119 | 400 |
## | 0.703 | 0.297 | |
## ---------------|-----------|-----------|-----------|
##
##
# Print the list returned by CrossTable(): the counts ($t) and the row,
# column and table proportions ($prop.row, $prop.col, $prop.tbl).
join
## $t
## y
## x Male Female
## normal 166 59
## obese 10 10
## overweight 59 22
## underweight 46 28
##
## $prop.row
## y
## x Male Female
## normal 0.7377778 0.2622222
## obese 0.5000000 0.5000000
## overweight 0.7283951 0.2716049
## underweight 0.6216216 0.3783784
##
## $prop.col
## y
## x Male Female
## normal 0.59074733 0.49579832
## obese 0.03558719 0.08403361
## overweight 0.20996441 0.18487395
## underweight 0.16370107 0.23529412
##
## $prop.tbl
## y
## x Male Female
## normal 0.4150 0.1475
## obese 0.0250 0.0250
## overweight 0.1475 0.0550
## underweight 0.1150 0.0700
# Grouped bar chart of the BMI-category counts for each sex.
join.count <- join$t

# One colour per BMI category (row of the count table).
bmi_colours <- rainbow(nrow(join.count))
barplot(
  join.count,
  beside = TRUE,
  col = bmi_colours,
  ylab = "frequency",
  xlab = "Sex"
)
summary(skeleto$BMIcat)
## normal obese overweight underweight
## 225 20 81 74

# Within each group the bars are drawn in the row order of `join.count`, so
# the legend labels are taken from its row names. The previous hard-coded
# order (underweight, normal, overweight, obese) did not match the row order
# (normal, obese, overweight, underweight) and mislabelled the colours.
legend(
  "topright",
  legend = rownames(join.count),
  pch = 15,
  col = bmi_colours
)
