Market Intelligence Week 1 R introduction

Basic Calculations

# Exponentiation with the ^ operator: 2 raised to the power 16.
2^16
## [1] 65536

Functions

# Calling a function: abs() gives the absolute value of its argument.
abs(-65)

Variables

# Bind a computed value to a name, then type the name to print it.
SquareRoot9 <- sqrt(9)
SquareRoot9
## [1] 3
# Number of hours in a (non-leap) year.
HoursYear <- 24 * 365
HoursYear
## [1] 8760
# List every object currently defined in the workspace.
ls()
## [1] "HoursYear"   "SquareRoot9"

Vectors

# c() combines individual values into a single vector.
c(2, 3, 5, 8, 13)
## [1]  2  3  5  8 13
Country <- c("Brazil", "China", "India", "Switzerland", "USA")
LifeExpectancy <- c(74, 76, 65, 83, 79)

# Square brackets pick out elements by position (counting from 1).
Country[1]
## [1] "Brazil"
LifeExpectancy[3]
## [1] 65
# Start at 1 and step by 2 without going past 20.
Sequence <- seq(from = 1, to = 20, by = 2)
Sequence
##  [1]  1  3  5  7  9 11 13 15 17 19

Data Frames

# Bundle the two vectors into a table; column names come from the vector names.
Data <- data.frame(Country, LifeExpectancy)
Data
##       Country LifeExpectancy
## 1      Brazil             74
## 2       China             76
## 3       India             65
## 4 Switzerland             83
## 5         USA             79
# cbind() attaches a further column to the existing table.
Population <- c(199000, 1390000, 1240000, 7997, 318000)
Data2 <- cbind(Data, Population)
Data2
##       Country LifeExpectancy Population
## 1      Brazil             74     199000
## 2       China             76    1390000
## 3       India             65    1240000
## 4 Switzerland             83       7997
## 5         USA             79     318000
# Build a second table with the same columns (this overwrites the three
# vectors defined earlier) ...
Country <- c("Australia", "Greece")
LifeExpectancy <- c(82, 81)
Population <- c(23050, 11125)
NewData <- data.frame(Country, LifeExpectancy, Population)
NewData
##     Country LifeExpectancy Population
## 1 Australia             82      23050
## 2    Greece             81      11125
# ... and rbind() stacks its rows underneath the first table.
Data3 <- rbind(Data2, NewData)
Data3
##       Country LifeExpectancy Population
## 1      Brazil             74     199000
## 2       China             76    1390000
## 3       India             65    1240000
## 4 Switzerland             83       7997
## 5         USA             79     318000
## 6   Australia             82      23050
## 7      Greece             81      11125

Reading data file

# NOTE: this path is specific to the author's machine; point it at the folder
# that contains WHO.csv before running.
setwd("~/Desktop/Manalytics/W1 intro R")
# Ask for factors explicitly: since R 4.0 read.csv() leaves text columns as
# character by default, whereas the output shown throughout this document
# (factor levels for Country, per-level counts for Region) assumes factors.
WHO <- read.csv("WHO.csv", stringsAsFactors = TRUE)
# Compact overview: one line per column with its type and first few values.
str(WHO)
## 'data.frame':    194 obs. of  13 variables:
##  $ Country                      : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Region                       : Factor w/ 6 levels "Africa","Americas",..: 3 4 1 4 1 2 2 4 6 4 ...
##  $ Population                   : int  29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
##  $ Under15                      : num  47.4 21.3 27.4 15.2 47.6 ...
##  $ Over60                       : num  3.82 14.93 7.17 22.86 3.84 ...
##  $ FertilityRate                : num  5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
##  $ LifeExpectancy               : int  60 74 73 82 51 75 76 71 82 81 ...
##  $ ChildMortality               : num  98.5 16.7 20 3.2 163.5 ...
##  $ CellularSubscribers          : num  54.3 96.4 99 75.5 48.4 ...
##  $ LiteracyRate                 : num  NA NA NA NA 70.1 99 97.8 99.6 NA NA ...
##  $ GNI                          : num  1140 8820 8310 NA 5230 ...
##  $ PrimarySchoolEnrollmentMale  : num  NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ...
##  $ PrimarySchoolEnrollmentFemale: num  NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
# Per-column summary of the first three columns only.
summary(WHO[1:3])
##                 Country                      Region     Population     
##  Afghanistan        :  1   Africa               :46   Min.   :      1  
##  Albania            :  1   Americas             :35   1st Qu.:   1696  
##  Algeria            :  1   Eastern Mediterranean:22   Median :   7790  
##  Andorra            :  1   Europe               :53   Mean   :  36360  
##  Angola             :  1   South-East Asia      :11   3rd Qu.:  24535  
##  Antigua and Barbuda:  1   Western Pacific      :27   Max.   :1390000  
##  (Other)            :188

Show the first few records (head() returns 6 by default; here we ask for 2)

# n sets how many leading rows to show.
head(WHO, n = 2)
##       Country                Region Population Under15 Over60
## 1 Afghanistan Eastern Mediterranean      29825   47.42   3.82
## 2     Albania                Europe       3162   21.33  14.93
##   FertilityRate LifeExpectancy ChildMortality CellularSubscribers
## 1          5.40             60           98.5               54.26
## 2          1.75             74           16.7               96.39
##   LiteracyRate  GNI PrimarySchoolEnrollmentMale
## 1           NA 1140                          NA
## 2           NA 8820                          NA
##   PrimarySchoolEnrollmentFemale
## 1                            NA
## 2                            NA

Subsetting

# Keep only the rows whose Region is "Europe".
WHO_Europe <- subset(WHO, subset = Region == "Europe")

Writing csv files

# Save the European subset as a CSV file in the current working directory.
write.csv(WHO_Europe, file = "WHO_Europe.csv")

Removing variables from memory

# Drop the subset from the workspace; the CSV on disk is unaffected.
rm(list = "WHO_Europe")

Fundamental data analysis

# Arithmetic mean of the Under15 column.
mean(WHO$Under15)
## [1] 28.73
# Standard deviation of the same column.
sd(WHO$Under15)
## [1] 10.53
# Minimum, quartiles, median, mean and maximum in one call.
summary(WHO$Under15)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    13.1    18.7    28.6    28.7    37.8    50.0

Which country has the lowest percentage of its population under 15 years old? (Find the record with the minimum value of that variable.)

# Row index of the smallest Under15 value.
which.min(WHO$Under15)
## [1] 86

the country name of that record

# Look the country up by the computed row index rather than a hard-coded
# number, so the answer stays correct if the data file gains, loses or
# reorders rows.
WHO$Country[which.min(WHO$Under15)]
## [1] Japan
## 194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe

Which country has the highest % of its population under 15 years old? (Find the record with the maximum value of that variable.)

# which.max() gives the row index of the largest Under15 value; that index is
# used directly to pick the matching country name.
WHO$Country[which.max(WHO$Under15)]
## [1] Niger
## 194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe

sort the records according to the % population under age 15 – show the lowest 4 values

# sort() orders ascending, so the first four values are the four smallest.
head(sort(WHO$Under15), n = 4)
## [1] 13.12 13.17 13.28 13.53

The two countries with the highest % of population under 15 (notice the minus sign, which makes order() sort from largest to smallest)

# Reorder the rows by Under15, largest first (negating the column flips the
# ascending sort), then show the top two rows.
tempdata <- WHO[order(-WHO$Under15), ]
head(tempdata, n = 2)
##     Country Region Population Under15 Over60 FertilityRate LifeExpectancy
## 124   Niger Africa      17157   49.99   4.26          7.58             56
## 181  Uganda Africa      36346   48.54   3.72          6.06             56
##     ChildMortality CellularSubscribers LiteracyRate  GNI
## 124          113.5               29.52           NA  720
## 181           68.9               48.38         73.2 1310
##     PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale
## 124                        64.2                          52.0
## 181                        89.7                          92.3
# The sorted copy is no longer needed.
rm(list = "tempdata")

Scatterplot

# Scatterplot: GNI on the horizontal axis, fertility rate on the vertical axis.
plot(x = WHO$GNI, y = WHO$FertilityRate)

plot of chunk unnamed-chunk-18

Subsetting

Find the set of outlier countries with income (GNI) > $10k and fertility rate > 2.5

# Rows must satisfy both conditions; & combines them element by element.
Outliers <- subset(WHO, subset = GNI > 10000 & FertilityRate > 2.5)

how many Outliers?

# One row per outlier country, so the row count is the number of outliers.
nrow(Outliers)
## [1] 7

Show the Country name, income and fertility rate of those outliers

# All rows, restricted to the three named columns.
Outliers[, c("Country", "GNI", "FertilityRate")]
##               Country   GNI FertilityRate
## 23           Botswana 14550          2.71
## 56  Equatorial Guinea 25620          5.04
## 63              Gabon 13740          4.18
## 83             Israel 27110          2.92
## 88         Kazakhstan 11250          2.52
## 131            Panama 14510          2.52
## 150      Saudi Arabia 24700          2.76

Histograms

smaller diagram

# Distribution of the CellularSubscribers column.
hist(x = WHO$CellularSubscribers)

plot of chunk unnamed-chunk-22

Boxplot with labels and title

# One box per region; the formula reads "LifeExpectancy grouped by Region".
boxplot(
  LifeExpectancy ~ Region,
  data = WHO,
  main = "Life Expectancy of Countries by Region",
  xlab = "Region",
  ylab = "Life Expectancy"
)

plot of chunk unnamed-chunk-23

Summary Tables

# Count how many records (countries) fall in each region.
table(WHO$Region)
## 
##                Africa              Americas Eastern Mediterranean 
##                    46                    35                    22 
##                Europe       South-East Asia       Western Pacific 
##                    53                    11                    27

Compare the average % of population over 60 across Regions

# Split Over60 by Region and take the mean within each group.
tapply(X = WHO$Over60, INDEX = WHO$Region, FUN = mean)
##                Africa              Americas Eastern Mediterranean 
##                 5.221                10.944                 5.620 
##                Europe       South-East Asia       Western Pacific 
##                19.775                 8.769                10.163

Compare the minimum Literacy rate across Regions

# Minimum per region; the missing LiteracyRate values are not removed here.
tapply(X = WHO$LiteracyRate, INDEX = WHO$Region, FUN = min)
##                Africa              Americas Eastern Mediterranean 
##                    NA                    NA                    NA 
##                Europe       South-East Asia       Western Pacific 
##                    NA                    NA                    NA
# Same grouping, but na.rm = TRUE is passed on to min() so that missing
# values are dropped before the minimum is taken.
tapply(X = WHO$LiteracyRate, INDEX = WHO$Region, FUN = min, na.rm = TRUE)
##                Africa              Americas Eastern Mediterranean 
##                  31.1                  75.2                  63.9 
##                Europe       South-East Asia       Western Pacific 
##                  95.2                  56.8                  60.6