W1_R_intro.R

alisonlo — Apr 9, 2014, 9:10 PM

# Market Intelligence Week 1 R introduction

## Basic Calculations
8*6
[1] 48
2^16
[1] 65536
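# 2^   (incomplete expression: R keeps reading for the right-hand operand, so it
#       was combined with the next statement and evaluated as 2^sqrt(9) = 8;
#       left commented out so that sqrt(9) below is evaluated on its own)

## Functions
sqrt(9)
[1] 3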
abs(-65)
[1] 65
?sqrt

## Variables
SquareRoot9 = sqrt(9)
SquareRoot9
[1] 3
HoursYear <- 365*24
HoursYear
[1] 8760
ls()
[1] "HoursYear"   "SquareRoot9"

## Vectors
c(2,3,5,8,13)
[1]  2  3  5  8 13
Country = c("Brazil", "China", "India","Switzerland","USA")
LifeExpectancy = c(74,76,65,83,79)
Country
[1] "Brazil"      "China"       "India"       "Switzerland" "USA"        
LifeExpectancy
[1] 74 76 65 83 79
c("Brazil",74,"China",76)
[1] "Brazil" "74"     "China"  "76"    
Country[1]
[1] "Brazil"
LifeExpectancy[3]
[1] 65
Sequence = seq(1,100,2)
Sequence
 [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45
[24] 47 49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91
[47] 93 95 97 99

## Data Frames
Data = data.frame(Country, LifeExpectancy)
Data
      Country LifeExpectancy
1      Brazil             74
2       China             76
3       India             65
4 Switzerland             83
5         USA             79
Population = c(199000,1390000,1240000,7997,318000)
Data2 = cbind(Data,Population)
Data2
      Country LifeExpectancy Population
1      Brazil             74     199000
2       China             76    1390000
3       India             65    1240000
4 Switzerland             83       7997
5         USA             79     318000
Country = c("Australia","Greece")
LifeExpectancy = c(82,81)
Population = c(23050,11125)
NewData = data.frame(Country, LifeExpectancy, Population)
NewData
    Country LifeExpectancy Population
1 Australia             82      23050
2    Greece             81      11125
Data3 = rbind(Data2, NewData)
Data3
      Country LifeExpectancy Population
1      Brazil             74     199000
2       China             76    1390000
3       India             65    1240000
4 Switzerland             83       7997
5         USA             79     318000
6   Australia             82      23050
7      Greece             81      11125

# Loading csv files
setwd("~/Desktop/Manalytics/W1 intro R")
WHO = read.csv("WHO.csv")
str(WHO)
'data.frame':   194 obs. of  13 variables:
 $ Country                      : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Region                       : Factor w/ 6 levels "Africa","Americas",..: 3 4 1 4 1 2 2 4 6 4 ...
 $ Population                   : int  29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
 $ Under15                      : num  47.4 21.3 27.4 15.2 47.6 ...
 $ Over60                       : num  3.82 14.93 7.17 22.86 3.84 ...
 $ FertilityRate                : num  5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
 $ LifeExpectancy               : int  60 74 73 82 51 75 76 71 82 81 ...
 $ ChildMortality               : num  98.5 16.7 20 3.2 163.5 ...
 $ CellularSubscribers          : num  54.3 96.4 99 75.5 48.4 ...
 $ LiteracyRate                 : num  NA NA NA NA 70.1 99 97.8 99.6 NA NA ...
 $ GNI                          : num  1140 8820 8310 NA 5230 ...
 $ PrimarySchoolEnrollmentMale  : num  NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ...
 $ PrimarySchoolEnrollmentFemale: num  NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
summary(WHO)
                Country                      Region     Population     
 Afghanistan        :  1   Africa               :46   Min.   :      1  
 Albania            :  1   Americas             :35   1st Qu.:   1696  
 Algeria            :  1   Eastern Mediterranean:22   Median :   7790  
 Andorra            :  1   Europe               :53   Mean   :  36360  
 Angola             :  1   South-East Asia      :11   3rd Qu.:  24535  
 Antigua and Barbuda:  1   Western Pacific      :27   Max.   :1390000  
 (Other)            :188                                               
    Under15         Over60      FertilityRate  LifeExpectancy
 Min.   :13.1   Min.   : 0.81   Min.   :1.26   Min.   :47.0  
 1st Qu.:18.7   1st Qu.: 5.20   1st Qu.:1.83   1st Qu.:64.0  
 Median :28.6   Median : 8.53   Median :2.40   Median :72.5  
 Mean   :28.7   Mean   :11.16   Mean   :2.94   Mean   :70.0  
 3rd Qu.:37.8   3rd Qu.:16.69   3rd Qu.:3.90   3rd Qu.:76.0  
 Max.   :50.0   Max.   :31.92   Max.   :7.58   Max.   :83.0  
                                NA's   :11                   
 ChildMortality   CellularSubscribers  LiteracyRate       GNI       
 Min.   :  2.20   Min.   :  2.57      Min.   :31.1   Min.   :  340  
 1st Qu.:  8.43   1st Qu.: 63.57      1st Qu.:71.6   1st Qu.: 2335  
 Median : 18.60   Median : 97.75      Median :91.8   Median : 7870  
 Mean   : 36.15   Mean   : 93.64      Mean   :83.7   Mean   :13321  
 3rd Qu.: 55.98   3rd Qu.:120.81      3rd Qu.:97.8   3rd Qu.:17558  
 Max.   :181.60   Max.   :196.41      Max.   :99.8   Max.   :86440  
                  NA's   :10          NA's   :91     NA's   :32     
 PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale
 Min.   : 37.2               Min.   : 32.5                
 1st Qu.: 87.7               1st Qu.: 87.3                
 Median : 94.7               Median : 95.1                
 Mean   : 90.8               Mean   : 89.6                
 3rd Qu.: 98.1               3rd Qu.: 97.9                
 Max.   :100.0               Max.   :100.0                
 NA's   :93                  NA's   :93                   

## Subsetting
WHO_Europe = subset(WHO, Region == "Europe")
str(WHO_Europe)
'data.frame':   53 obs. of  13 variables:
 $ Country                      : Factor w/ 194 levels "Afghanistan",..: 2 4 8 10 11 16 17 22 26 43 ...
 $ Region                       : Factor w/ 6 levels "Africa","Americas",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ Population                   : int  3162 78 2969 8464 9309 9405 11060 3834 7278 4307 ...
 $ Under15                      : num  21.3 15.2 20.3 14.5 22.2 ...
 $ Over60                       : num  14.93 22.86 14.06 23.52 8.24 ...
 $ FertilityRate                : num  1.75 NA 1.74 1.44 1.96 1.47 1.85 1.26 1.51 1.48 ...
 $ LifeExpectancy               : int  74 82 71 81 71 71 80 76 74 77 ...
 $ ChildMortality               : num  16.7 3.2 16.4 4 35.2 5.2 4.2 6.7 12.1 4.7 ...
 $ CellularSubscribers          : num  96.4 75.5 103.6 154.8 108.8 ...
 $ LiteracyRate                 : num  NA NA 99.6 NA NA NA NA 97.9 NA 98.8 ...
 $ GNI                          : num  8820 NA 6100 42050 8960 ...
 $ PrimarySchoolEnrollmentMale  : num  NA 78.4 NA NA 85.3 NA 98.9 86.5 99.3 94.8 ...
 $ PrimarySchoolEnrollmentFemale: num  NA 79.4 NA NA 84.1 NA 99.2 88.4 99.7 97 ...

## Writing csv files
write.csv(WHO_Europe, "WHO_Europe.csv")

## Removing variables
rm(WHO_Europe)


# Basic data analysis 

mean(WHO$Under15)
[1] 28.73
sd(WHO$Under15)
[1] 10.53
summary(WHO$Under15)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   13.1    18.7    28.6    28.7    37.8    50.0 

which.min(WHO$Under15)
[1] 86
WHO$Country[86]
[1] Japan
194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe

which.max(WHO$Under15)
[1] 124
WHO$Country[124]
[1] Niger
194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe

sort(WHO$Under15)
  [1] 13.12 13.17 13.28 13.53 14.04 14.04 14.16 14.18 14.41 14.51 14.56
 [12] 14.57 14.60 14.62 14.79 14.91 14.92 14.98 14.98 15.00 15.05 15.10
 [23] 15.13 15.20 15.20 15.25 15.45 15.69 16.35 16.37 16.42 16.45 16.48
 [34] 16.52 16.58 16.71 16.88 16.89 17.16 17.21 17.46 17.54 17.62 17.66
 [45] 17.95 18.26 18.26 18.47 18.64 18.95 18.99 19.01 19.63 20.16 20.17
 [56] 20.26 20.34 20.71 20.73 21.33 21.38 21.54 21.62 21.64 21.95 21.98
 [67] 22.05 22.25 22.87 23.22 23.68 23.94 24.19 24.31 24.42 24.56 24.90
 [78] 25.15 25.28 25.46 25.70 25.75 25.96 25.96 25.96 26.00 26.65 26.96
 [89] 27.05 27.42 27.53 27.78 27.83 27.85 28.03 28.53 28.65 28.65 28.84
[100] 28.88 28.90 29.02 29.03 29.18 29.27 29.43 29.45 29.53 29.69 30.10
[111] 30.10 30.10 30.10 30.17 30.21 30.29 30.53 30.57 30.61 30.61 30.61
[122] 30.62 31.23 31.25 32.78 33.37 33.72 33.75 34.13 34.31 34.40 34.53
[133] 35.23 35.35 35.35 35.58 35.61 35.72 35.75 35.81 36.59 36.75 36.77
[144] 37.33 37.37 37.88 38.05 38.37 38.49 38.59 38.95 40.07 40.22 40.24
[155] 40.37 40.51 40.72 40.80 41.48 41.48 41.55 41.60 41.74 41.89 42.17
[166] 42.28 42.37 42.37 42.46 42.72 42.95 43.06 43.08 43.10 43.29 43.54
[177] 43.56 44.20 44.23 44.85 45.11 45.38 45.44 45.66 45.90 46.33 46.73
[188] 47.14 47.35 47.42 47.58 48.52 48.54 49.99


## Scatterplot
plot(WHO$GNI, WHO$FertilityRate)

[Figure: scatterplot of WHO$FertilityRate (y-axis) against WHO$GNI (x-axis)]


## Subsetting
Outliers = subset(WHO, GNI > 10000 & FertilityRate > 2.5) 
nrow(Outliers)
[1] 7
Outliers[c("Country","GNI","FertilityRate")]
              Country   GNI FertilityRate
23           Botswana 14550          2.71
56  Equatorial Guinea 25620          5.04
63              Gabon 13740          4.18
83             Israel 27110          2.92
88         Kazakhstan 11250          2.52
131            Panama 14510          2.52
150      Saudi Arabia 24700          2.76

## Histograms
hist(WHO$CellularSubscribers)

[Figure: histogram of WHO$CellularSubscribers]


## Boxplot
boxplot(WHO$LifeExpectancy ~ WHO$Region)

[Figure: boxplot of WHO$LifeExpectancy grouped by WHO$Region, default labels]


boxplot(WHO$LifeExpectancy ~ WHO$Region, xlab = "Region", ylab = "Life Expectancy", main = "Life Expectancy of Countries by Region")

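[Figure: boxplot of WHO$LifeExpectancy grouped by WHO$Region, x-axis "Region", y-axis "Life Expectancy", title "Life Expectancy of Countries by Region"]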


## Summary Tables
table(WHO$Region)

               Africa              Americas Eastern Mediterranean 
                   46                    35                    22 
               Europe       South-East Asia       Western Pacific 
                   53                    11                    27 

tapply(WHO$Over60, WHO$Region, mean)
               Africa              Americas Eastern Mediterranean 
                5.221                10.944                 5.620 
               Europe       South-East Asia       Western Pacific 
               19.775                 8.769                10.163 
tapply(WHO$LiteracyRate, WHO$Region, min)
               Africa              Americas Eastern Mediterranean 
                   NA                    NA                    NA 
               Europe       South-East Asia       Western Pacific 
                   NA                    NA                    NA 
tapply(WHO$LiteracyRate, WHO$Region, min, na.rm=TRUE)
               Africa              Americas Eastern Mediterranean 
                 31.1                  75.2                  63.9 
               Europe       South-East Asia       Western Pacific 
                 95.2                  56.8                  60.6