# By Luis George
# This script shows how to manipulate data in R:
# how to rename the columns (variables) of a data frame
# and how to plot with the lattice package.
# The data show life expectancy by region.
# Load the life-expectancy table, give its columns meaningful names and
# preview the first rows.
setwd("C:\\Users\\Luis\\Documents\\Making sense of data")
# The file is read without a header, so the columns get the default names
# V1..V3. stringsAsFactors = TRUE keeps the text columns as factors on
# R >= 4.0 as well, which is what the recorded str()/summary() output shows.
lifesp <- read.table(
  "C:\\Users\\Luis\\Documents\\Making sense of data\\LifeExpRegion.txt",
  sep = "",
  stringsAsFactors = TRUE
)
names(lifesp)
## [1] "V1" "V2" "V3"
# NOTE: the former attach(lifesp) call was dropped. It ran before the rename,
# so it only exposed a stale copy with the V1..V3 names on the search path;
# every later call reads its columns from `lifesp` directly.
library(plyr)
lifesp <- rename(
  lifesp,
  c("V1" = "country", "V2" = "lifespx", "V3" = "region")
)
names(lifesp)
## [1] "country" "lifespx" "region"
# head() never pads with NA rows when the table is shorter than requested.
head(lifesp, n = 5)
##       country lifespx region
## 1 Afghanistan  48.673    SAs
## 2     Albania  76.918   EuCA
## 3     Algeria  73.131   MENA
## 4      Angola  51.093    SSA
## 5   Argentina  75.901   Amer
# Structure of the renamed table, followed by four lattice views of life
# expectancy split by region.
str(lifesp)
## 'data.frame':    197 obs. of  3 variables:
##  $ country: Factor w/ 197 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lifespx: num  48.7 76.9 73.1 51.1 75.9 ...
##  $ region : Factor w/ 6 levels "Amer","EAP","EuCA",..: 5 3 4 6 1 3 1 2 3 3 ...
library(lattice)
# All formulas name columns only and let `data = lifesp` resolve them, so the
# default axis labels read "lifespx" rather than "lifesp$lifespx".

# One histogram panel per region.
histogram(~ lifespx | factor(region), data = lifesp)

# Overlaid density curves, one per region, with the key on the right.
densityplot(~ lifespx,
            data = lifesp,
            groups = region,
            plot.points = FALSE,
            auto.key = list(space = "right", title = "region"))

# Box plots per region; varwidth scales box width with the group size.
# factor() makes the grouping explicit even if region was read as text.
bwplot(lifespx ~ factor(region),
       data = lifesp,
       varwidth = TRUE,
       ylab = "Life Expectancy",
       xlab = "Region")

# Jittered, semi-transparent points per region.
stripplot(lifespx ~ factor(region),
          data = lifesp,
          jitter.data = TRUE,
          alpha = 0.6,
          ylab = "Expectativa de Vida",
          xlab = "Region")

# Column-by-column overview: level counts for the factor columns and the
# quartiles/mean of lifespx.
summary(object = lifesp)
##         country       lifespx       region  
##  Afghanistan:  1   Min.   :47.79   Amer:39  
##  Albania    :  1   1st Qu.:64.67   EAP :30  
##  Algeria    :  1   Median :73.23   EuCA:50  
##  Angola     :  1   Mean   :69.86   MENA:21  
##  Argentina  :  1   3rd Qu.:76.65   SAs : 8  
##  Armenia    :  1   Max.   :83.39   SSA :49  
##  (Other)    :191
# Number of countries in each region, stored and then displayed.
t1 <- table(lifesp[["region"]])
t1
## 
## Amer  EAP EuCA MENA  SAs  SSA 
##   39   30   50   21    8   49
# Life-expectancy summary for the countries of the "Amer" region.
# `Amer` is a level of `region`, not a column, so the earlier
# summary(lifesp$Amer) summarised NULL (Length 0, Class NULL, Mode NULL).
summary(lifesp$lifespx[lifesp$region == "Amer"])
## (output not recorded; re-run to see the summary for the 39 Amer countries)
str(lifesp)
## 'data.frame':    197 obs. of  3 variables:
##  $ country: Factor w/ 197 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lifespx: num  48.7 76.9 73.1 51.1 75.9 ...
##  $ region : Factor w/ 6 levels "Amer","EAP","EuCA",..: 5 3 4 6 1 3 1 2 3 3 ...
# Absolute and relative frequency of each region, side by side.
# prop.table() divides by the table's own total instead of a hard-coded 197,
# so the shares stay correct if the number of rows changes.
freqRegion <- table(lifesp$region)
relFreqRegion <- prop.table(freqRegion)
cbind(freqRegion, relFreqRegion)
##      freqRegion relFreqRegion
## Amer         39    0.19796954
## EAP          30    0.15228426
## EuCA         50    0.25380711
## MENA         21    0.10659898
## SAs           8    0.04060914
## SSA          49    0.24873096
##C:\Users\Luis\Documents\Making sense of data(video)
# Read the skeleton data set (the file has a header row) and summarise it.
# stringsAsFactors = TRUE keeps text columns such as BMIcat as factors on
# R >= 4.0 too; the summary()/factor() calls further down rely on that.
skeleto <- read.table(
  "C:\\Users\\Luis\\Documents\\Making sense of data\\SkeletonData2.txt",
  header = TRUE,
  stringsAsFactors = TRUE
)
summary(skeleto)
##   Observation         Sex                BMIcat       BMIquant    
##  Min.   :  1.0   Min.   :1.000   normal     :225   Min.   :10.06  
##  1st Qu.:100.8   1st Qu.:1.000   obese      : 20   1st Qu.:19.41  
##  Median :200.5   Median :1.000   overweight : 81   Median :22.48  
##  Mean   :200.5   Mean   :1.298   underweight: 74   Mean   :22.50  
##  3rd Qu.:300.2   3rd Qu.:2.000                     3rd Qu.:25.11  
##  Max.   :400.0   Max.   :2.000                     Max.   :40.03  
##       Age          DGEstimate    DGDifference   
##  Min.   :19.00   Min.   :12.0   Min.   :-60.00  
##  1st Qu.:40.00   1st Qu.:32.0   1st Qu.:-23.00  
##  Median :51.00   Median :32.0   Median :-13.00  
##  Mean   :52.35   Mean   :38.2   Mean   :-14.15  
##  3rd Qu.:63.25   3rd Qu.:44.0   3rd Qu.: -5.00  
##  Max.   :85.00   Max.   :66.0   Max.   : 32.00
# Inspect the skeleton data, then recode Sex and build a BMI factor.
str(skeleto)
## 'data.frame':    400 obs. of  7 variables:
##  $ Observation : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Sex         : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ BMIcat      : Factor w/ 4 levels "normal","obese",..: 4 1 3 3 1 4 3 4 1 1 ...
##  $ BMIquant    : num  15.7 23 27.9 27.8 21.4 ...
##  $ Age         : int  78 44 72 59 60 34 50 73 70 60 ...
##  $ DGEstimate  : int  44 32 32 44 32 25 32 50 39 44 ...
##  $ DGDifference: int  -34 -12 -40 -15 -28 -9 -18 -23 -31 -16 ...
summary(skeleto$BMIcat)
##      normal       obese  overweight underweight 
##         225          20          81          74
str(skeleto$BMIcat)
##  Factor w/ 4 levels "normal","obese",..: 4 1 3 3 1 4 3 4 1 1 ...
# Sex is stored as the integer codes 1/2; turn it into a labelled factor in a
# single step (1 -> "Male", 2 -> "Female").
skeleto$Sex <- factor(skeleto$Sex,
                      levels = c(1, 2),
                      labels = c("Male", "Female"))
# Stand-alone copy of BMIcat with its levels in increasing-BMI order.
# BMIcat already holds text levels, so the levels must be those names; the
# previous levels = c("1","2","3","4") matched nothing and gave an all-NA
# factor. skeleto$BMIcat itself is left untouched.
BMIcat <- factor(skeleto$BMIcat,
                 levels = c("underweight",
                            "normal",
                            "overweight",
                            "obese"))
str(skeleto)
## 'data.frame':    400 obs. of  7 variables:
##  $ Observation : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Sex         : Factor w/ 2 levels "Male","Female": 2 1 1 1 1 1 1 1 1 1 ...
##  $ BMIcat      : Factor w/ 4 levels "normal","obese",..: 4 1 3 3 1 4 3 4 1 1 ...
##  $ BMIquant    : num  15.7 23 27.9 27.8 21.4 ...
##  $ Age         : int  78 44 72 59 60 34 50 73 70 60 ...
##  $ DGEstimate  : int  44 32 32 44 32 25 32 50 39 44 ...
##  $ DGDifference: int  -34 -12 -40 -15 -28 -9 -18 -23 -31 -16 ...
# Preview the recoded data, then tabulate Sex and the BMI categories.
# head() never pads with NA rows when the table is shorter than requested.
head(skeleto, n = 5)
##   Observation    Sex      BMIcat BMIquant Age DGEstimate DGDifference
## 1           1 Female underweight    15.66  78         44          -34
## 2           2   Male      normal    23.03  44         32          -12
## 3           3   Male  overweight    27.92  72         32          -40
## 4           4   Male  overweight    27.83  59         44          -15
## 5           5   Male      normal    21.41  60         32          -28
summary(skeleto$Sex)
##   Male Female 
##    281    119
# Absolute and relative frequency of each BMI category, side by side.
# prop.table() divides by the table's own total instead of a hard-coded 400.
freqBMIcat <- table(skeleto$BMIcat)
relfrecBMIcat <- prop.table(freqBMIcat)
cbind(freqBMIcat, relfrecBMIcat)
##             freqBMIcat relfrecBMIcat
## normal             225        0.5625
## obese               20        0.0500
## overweight          81        0.2025
## underweight         74        0.1850
# Absolute and relative frequency of each sex, side by side.
# prop.table() divides by the table's own total instead of a hard-coded 400.
freqSex <- table(skeleto$Sex)
relfreqSex <- prop.table(freqSex)
cbind(freqSex, relfreqSex)
##        freqSex relfreqSex
## Male       281     0.7025
## Female     119     0.2975
library(gmodels)
# Two-way table of BMI category (rows) by sex (columns). CrossTable() prints
# the formatted cell table and its result is kept in `join` for later use.
join <- CrossTable(skeleto$BMIcat,
                   skeleto$Sex,
                   chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  400 
## 
##  
##                | skeleto$Sex 
## skeleto$BMIcat |      Male |    Female | Row Total | 
## ---------------|-----------|-----------|-----------|
##         normal |       166 |        59 |       225 | 
##                |     0.399 |     0.941 |           | 
##                |     0.738 |     0.262 |     0.562 | 
##                |     0.591 |     0.496 |           | 
##                |     0.415 |     0.147 |           | 
## ---------------|-----------|-----------|-----------|
##          obese |        10 |        10 |        20 | 
##                |     1.167 |     2.757 |           | 
##                |     0.500 |     0.500 |     0.050 | 
##                |     0.036 |     0.084 |           | 
##                |     0.025 |     0.025 |           | 
## ---------------|-----------|-----------|-----------|
##     overweight |        59 |        22 |        81 | 
##                |     0.077 |     0.183 |           | 
##                |     0.728 |     0.272 |     0.203 | 
##                |     0.210 |     0.185 |           | 
##                |     0.147 |     0.055 |           | 
## ---------------|-----------|-----------|-----------|
##    underweight |        46 |        28 |        74 | 
##                |     0.689 |     1.627 |           | 
##                |     0.622 |     0.378 |     0.185 | 
##                |     0.164 |     0.235 |           | 
##                |     0.115 |     0.070 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |       281 |       119 |       400 | 
##                |     0.703 |     0.297 |           | 
## ---------------|-----------|-----------|-----------|
## 
## 
# Display the list stored by the CrossTable() call: the cell counts ($t) and
# the row, column and overall proportions ($prop.row, $prop.col, $prop.tbl).
join
## $t
##              y
## x             Male Female
##   normal       166     59
##   obese         10     10
##   overweight    59     22
##   underweight   46     28
## 
## $prop.row
##              y
## x                  Male    Female
##   normal      0.7377778 0.2622222
##   obese       0.5000000 0.5000000
##   overweight  0.7283951 0.2716049
##   underweight 0.6216216 0.3783784
## 
## $prop.col
##              y
## x                   Male     Female
##   normal      0.59074733 0.49579832
##   obese       0.03558719 0.08403361
##   overweight  0.20996441 0.18487395
##   underweight 0.16370107 0.23529412
## 
## $prop.tbl
##              y
## x               Male Female
##   normal      0.4150 0.1475
##   obese       0.0250 0.0250
##   overweight  0.1475 0.0550
##   underweight 0.1150 0.0700
# Grouped bar chart of the BMI-category counts within each sex.
join.count <- join$t
# One colour per BMI category (row of the count table); the same vector is
# reused for the legend so bars and key cannot drift apart.
join.cols <- rainbow(nrow(join.count))
barplot(join.count,
        beside = TRUE,
        col = join.cols,
        ylab = "frequency",
        xlab = "Sex")
summary(skeleto$BMIcat)
##      normal       obese  overweight underweight 
##         225          20          81          74
# Bars are drawn in row order of join.count (normal, obese, overweight,
# underweight), so the legend labels come from the row names; the previous
# hard-coded order paired the colours with the wrong categories.
legend("topright",
       legend = rownames(join.count),
       pch = 15,
       col = join.cols)