February 16, 2015
More than 5 billion people are calling, texting, tweeting and browsing on mobile phones worldwide.
Facebook, a social networking website, is home to 40 billion photos
In 2008, Google was processing 20,000 terabytes of data (20 petabytes) a day
YouTube users upload 48 hours of new video every minute of the day
According to Twitter’s own research in early 2012, it sees roughly 175 million tweets every day, and has more than 465 million accounts
Wal-Mart, a retail giant, handles more than 1M customer transactions every hour
R is free, open source
Rich collection of user contributed packages (libraries) used for data analysis and manipulations
Supported by open source community – more than 2M R users/developers around the world
Great graphics and data visualization
Can be integrated with other languages like C, C++, Java
Frequent releases (an annual release plus bug-fix releases) and active development
# Basic arithmetic: R evaluates the expression and prints the result.
3 + 4
## [1] 7

# b is an object. The output of the expression is assigned to
# object b using the assignment operator `<-`.
b <- 10 * 5 / 100
b # prints the value of b
## [1] 0.5

# Exponentiation, square root, logarithm with an explicit base,
# rounding to two decimals, and the exponential function.
2^4
## [1] 16
sqrt(2)
## [1] 1.414214
log(200, base = 2)
## [1] 7.643856
round(2.53335, 2)
## [1] 2.53
exp(7.64)
## [1] 2079.744

# this is a comment
x <- 1 # assign 1 to x using assignment operator
x # print x
## [1] 1

# Character strings are assigned and printed the same way.
y <- "Hello"
y
## [1] "Hello"
R has useful functions to calculate probabilities from a known distribution or to plot the mass or density function
# X ~ Binomial(50, 1/2): probability mass function over x = 0, ..., 50,
# drawn as vertical spikes (type = "h").
x <- 0:50
px <- dbinom(x, size = 50, prob = 0.5)
plot(x, px, type = "h", col = "red", main = "Binomial(50,0.5)")

# Standard normal density on a 100-point grid over [-2, 2];
# abline(v = 0) marks the centre of the distribution.
x <- seq(-2, 2, length.out = 100)
plot(x, dnorm(x), type = "l",
     main = "Probability Density Function of x", col = "red")
abline(v = 0)

# Standard normal cumulative distribution function on the same grid.
x <- seq(-2, 2, length.out = 100)
plot(x, pnorm(x), type = "l",
     main = "Cumulative Distribution Function of x", col = "red")
abline(v = 0)
# Two parallel vectors, one element per country.
country <- c("India", "United States", "Brazil", "China")
lifeexpectancy <- c(65, 79, 74, 76)
country
## [1] "India" "United States" "Brazil" "China"
lifeexpectancy
## [1] 65 79 74 76

# Bind the vectors column-wise into a data frame and print it.
data <- data.frame(country, lifeexpectancy)
data
##         country lifeexpectancy
## 1         India             65
## 2 United States             79
## 3        Brazil             74
## 4         China             76

# Append a third column. cbind() returns a new data frame, so `data`
# itself is left unchanged and the result is stored in `data1`.
population <- c(1240000, 318000, 199000, 1390000)
data1 <- cbind(data, population)
data1 # view the updated data frame object
##         country lifeexpectancy population
## 1         India             65    1240000
## 2 United States             79     318000
## 3        Brazil             74     199000
## 4         China             76    1390000
# Read the WHO country indicators from the working directory.
# stringsAsFactors = TRUE is spelled out because the read.csv() default
# became FALSE in R 4.0.0; the str() output below (Country and Region as
# factors) and the "194 Levels: ..." printouts further down need factors.
who <- read.csv("WHO.csv", stringsAsFactors = TRUE)
# Compact overview: one line per column with its type and first values.
str(who)
## 'data.frame': 194 obs. of 13 variables: ## $ Country : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ... ## $ Region : Factor w/ 6 levels "Africa","Americas",..: 3 4 1 4 1 2 2 4 6 4 ... ## $ Population : int 29825 3162 38482 78 20821 89 41087 2969 23050 8464 ... ## $ Under15 : num 47.4 21.3 27.4 15.2 47.6 ... ## $ Over60 : num 3.82 14.93 7.17 22.86 3.84 ... ## $ FertilityRate : num 5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ... ## $ LifeExpectancy : int 60 74 73 82 51 75 76 71 82 81 ... ## $ ChildMortality : num 98.5 16.7 20 3.2 163.5 ... ## $ CellularSubscribers : num 54.3 96.4 99 75.5 48.4 ... ## $ LiteracyRate : num NA NA NA NA 70.1 99 97.8 99.6 NA NA ... ## $ GNI : num 1140 8820 8310 NA 5230 ... ## $ PrimarySchoolEnrollmentMale : num NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ... ## $ PrimarySchoolEnrollmentFemale: num NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
# Column-wise summary of the WHO data: level counts for the factor columns,
# min/quartiles/mean/max plus the number of NA's for the numeric columns.
summary(who)
## Country Region Population ## Afghanistan : 1 Africa :46 Min. : 1 ## Albania : 1 Americas :35 1st Qu.: 1696 ## Algeria : 1 Eastern Mediterranean:22 Median : 7790 ## Andorra : 1 Europe :53 Mean : 36360 ## Angola : 1 South-East Asia :11 3rd Qu.: 24535 ## Antigua and Barbuda: 1 Western Pacific :27 Max. :1390000 ## (Other) :188 ## Under15 Over60 FertilityRate LifeExpectancy ## Min. :13.12 Min. : 0.81 Min. :1.260 Min. :47.00 ## 1st Qu.:18.72 1st Qu.: 5.20 1st Qu.:1.835 1st Qu.:64.00 ## Median :28.65 Median : 8.53 Median :2.400 Median :72.50 ## Mean :28.73 Mean :11.16 Mean :2.941 Mean :70.01 ## 3rd Qu.:37.75 3rd Qu.:16.69 3rd Qu.:3.905 3rd Qu.:76.00 ## Max. :49.99 Max. :31.92 Max. :7.580 Max. :83.00 ## NA's :11 ## ChildMortality CellularSubscribers LiteracyRate GNI ## Min. : 2.200 Min. : 2.57 Min. :31.10 Min. : 340 ## 1st Qu.: 8.425 1st Qu.: 63.57 1st Qu.:71.60 1st Qu.: 2335 ## Median : 18.600 Median : 97.75 Median :91.80 Median : 7870 ## Mean : 36.149 Mean : 93.64 Mean :83.71 Mean :13321 ## 3rd Qu.: 55.975 3rd Qu.:120.81 3rd Qu.:97.85 3rd Qu.:17558 ## Max. :181.600 Max. :196.41 Max. :99.80 Max. :86440 ## NA's :10 NA's :91 NA's :32 ## PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale ## Min. : 37.20 Min. : 32.50 ## 1st Qu.: 87.70 1st Qu.: 87.30 ## Median : 94.70 Median : 95.10 ## Mean : 90.85 Mean : 89.63 ## 3rd Qu.: 98.10 3rd Qu.: 97.90 ## Max. :100.00 Max. :100.00 ## NA's :93 NA's :93
Identify the countries with the minimum and maximum values in the under-15 population category (the Under15 column)
# Five-number summary (plus mean) of the Under15 column.
with(who, summary(Under15))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   13.12   18.72   28.65   28.73   37.75   49.99
# Row index of the smallest Under15 value ...
with(who, which.min(Under15))
## [1] 86
# ... and the country stored at that row.
with(who, Country[which.min(Under15)])
## [1] Japan
## 194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe
# Same lookup for the largest Under15 value.
with(who, Country[which.max(Under15)])
## [1] Niger
## 194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe
# Scatter plot of fertility rate against GNI, one point per country.
with(
  who,
  plot(
    GNI, FertilityRate,
    col = "blue",
    xlab = "GNI",
    ylab = "Fertility Rate",
    main = "Scatter Plot - GNI vs Fertility Rate"
  )
)
# Reading the plot: countries are mostly either high income with a low
# fertility rate, or low income with a high fertility rate.

# Countries that break that pattern: GNI above 10000 together with a
# fertility rate above 2.5. which() drops rows where the condition is NA,
# so countries with a missing GNI or FertilityRate are excluded.
outliers <- who[which(who$GNI > 10000 & who$FertilityRate > 2.5), ]
# outliers # would print every column
outliers[, c("Country", "GNI", "FertilityRate")]
## Country GNI FertilityRate ## 23 Botswana 14550 2.71 ## 56 Equatorial Guinea 25620 5.04 ## 63 Gabon 13740 4.18 ## 83 Israel 27110 2.92 ## 88 Kazakhstan 11250 2.52 ## 131 Panama 14510 2.52 ## 150 Saudi Arabia 24700 2.76
# Histogram of the CellularSubscribers column across all countries.
hist(who$CellularSubscribers, main="Histogram of cellular subscribers")
# most countries have 50 - 150 cellular subscribers per 100 population
# One box per region, showing the spread of life expectancy within it.
boxplot(
  LifeExpectancy ~ Region,
  data = who,
  xlab = "Region",
  ylab = "Life Expectancy",
  main = "Life Expectancy by Region"
)
# Countries in the Americas region with a life expectancy below 68.
who[who$LifeExpectancy < 68 & who$Region == "Americas", "Country"]
## [1] Bolivia (Plurinational State of) Guyana
## [3] Haiti
## 194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe
# Countries in the Europe region with a life expectancy below 65.
who[who$LifeExpectancy < 65 & who$Region == "Europe", "Country"]
## [1] Turkmenistan
## 194 Levels: Afghanistan Albania Algeria Andorra ... Zimbabwe
# Load our data:
# Strings are kept as plain character vectors (stringsAsFactors=FALSE).
# NOTE(review): the str() output that follows shows Date as POSIXlt, which
# read.csv() does not produce, so a date-conversion step (and possibly the
# derivation of the Weekday and Hour columns) plus the str(mvt) call appear
# to be missing from this transcript -- TODO restore from the original script.
mvt = read.csv("mvt.csv", stringsAsFactors=FALSE)
## 'data.frame': 191641 obs. of 5 variables: ## $ Date : POSIXlt, format: "2012-12-31 23:15:00" "2012-12-31 22:00:00" ... ## $ Latitude : num 41.8 41.9 42 41.8 41.8 ... ## $ Longitude: num -87.6 -87.7 -87.8 -87.7 -87.6 ... ## $ Weekday : chr "Monday" "Monday" "Monday" "Monday" ... ## $ Hour : int 23 22 22 22 21 20 20 20 19 18 ...
# Create a simple line plot - need the total number of crimes on each day of
# the week. We can get this information by creating a table:
table(mvt$Weekday)
##
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday
##     29284     27397     27118     26316     27319     26791     27416

# Save this table as a data frame:
WeekdayCounts <- as.data.frame(table(mvt$Weekday))
str(WeekdayCounts)
## 'data.frame': 7 obs. of 2 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7
## $ Freq: int 29284 27397 27118 26316 27319 26791 27416

# Load the ggplot2 library:
library(ggplot2)

# table() orders the weekdays alphabetically (see the levels above);
# re-level Var1 so the x axis runs Sunday through Saturday.
WeekdayCounts$Var1 <- factor(
  WeekdayCounts$Var1,
  ordered = TRUE,
  levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
             "Thursday", "Friday", "Saturday")
)

# group = 1 places all seven points in a single group so that geom_line()
# connects them across the discrete x axis.
ggplot(WeekdayCounts, aes(x = Var1, y = Freq)) +
  geom_line(aes(group = 1)) +
  xlab("Day of the Week") +
  ylab("Total Motor Vehicle Thefts")
# Create a counts table for the weekday and hour:
# rows are weekdays, columns are the hours 0-23.
table(mvt$Weekday, mvt$Hour)
## ## 0 1 2 3 4 5 6 7 8 9 10 11 ## Friday 1873 932 743 560 473 602 839 1203 1268 1286 938 822 ## Monday 1900 825 712 527 415 542 772 1123 1323 1235 971 737 ## Saturday 2050 1267 985 836 652 508 541 650 858 1039 946 789 ## Sunday 2028 1236 1019 838 607 461 478 483 615 864 884 787 ## Thursday 1856 816 696 508 400 534 799 1135 1298 1301 932 731 ## Tuesday 1691 777 603 464 414 520 845 1118 1175 1174 948 786 ## Wednesday 1814 790 619 469 396 561 862 1140 1329 1237 947 763 ## ## 12 13 14 15 16 17 18 19 20 21 22 23 ## Friday 1207 857 937 1140 1165 1318 1623 1652 1736 1881 2308 1921 ## Monday 1129 824 958 1059 1136 1252 1518 1503 1622 1815 2009 1490 ## Saturday 1204 767 963 1086 1055 1084 1348 1390 1570 1702 2078 1750 ## Sunday 1192 789 959 1037 1083 1160 1389 1342 1706 1696 2079 1584 ## Thursday 1093 752 831 1044 1131 1258 1510 1537 1668 1776 2134 1579 ## Tuesday 1108 762 908 1071 1090 1274 1553 1496 1696 1816 2044 1458 ## Wednesday 1225 804 863 1075 1076 1289 1580 1507 1718 1748 2093 1511
# Save this to a data frame:
DayHourCounts <- as.data.frame(table(mvt$Weekday, mvt$Hour))
str(DayHourCounts)
## 'data.frame': 168 obs. of 3 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
## $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ Freq: int 1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...

# Both plots below map `Hour`, a column as.data.frame(table(...)) does not
# create (the str() output lists only Var1, Var2 and Freq), so derive it
# from Var2. Convert through as.character() first: as.numeric() applied
# directly to a factor returns the level codes (1..24), not the hour
# labels (0..23).
DayHourCounts$Hour <- as.numeric(as.character(DayHourCounts$Var2))

# One semi-transparent line per weekday: thefts by hour of day.
# `size` sets the line width here; ggplot2 >= 3.4.0 prefers `linewidth`
# for lines but still accepts `size` with a deprecation warning.
ggplot(DayHourCounts, aes(x = Hour, y = Freq)) +
  geom_line(aes(group = Var1, color = Var1), size = 2, alpha = 0.5)

# Heat map of the same counts: hour on x, weekday on y, colour = count.
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) +
  geom_tile(aes(fill = Freq)) +
  scale_fill_gradient(name = "Total MV Thefts", low = "white", high = "red") +
  theme(axis.title.y = element_blank())
# Read the map data from the working directory. Per the str() output below,
# each row is one outline point (long/lat/group/order) carrying that
# state's statistics (Population, Murders, GunOwnership, ...).
# stringsAsFactors = TRUE is spelled out because the read.csv() default
# became FALSE in R 4.0.0, while the output below shows region, subregion
# and State as factors.
murderMap <- read.csv("murderM.csv", stringsAsFactors = TRUE)
str(murderMap)
## 'data.frame': 15537 obs. of 13 variables: ## $ X : int 1 2 3 4 5 6 7 8 9 10 ... ## $ region : Factor w/ 49 levels "alabama","arizona",..: 1 1 1 1 1 1 1 1 1 1 ... ## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ... ## $ lat : num 30.4 30.4 30.4 30.3 30.3 ... ## $ group : int 1 1 1 1 1 1 1 1 1 1 ... ## $ order : int 1 2 3 4 5 6 7 8 9 10 ... ## $ subregion : Factor w/ 16 levels "chesapeake","chincoteague",..: NA NA NA NA NA NA NA NA NA NA ... ## $ State : Factor w/ 49 levels "Alabama","Arizona",..: 1 1 1 1 1 1 1 1 1 1 ... ## $ Population : int 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ... ## $ PopulationDensity: num 94.7 94.7 94.7 94.7 94.7 ... ## $ Murders : int 199 199 199 199 199 199 199 199 199 199 ... ## $ GunMurders : int 135 135 135 135 135 135 135 135 135 135 ... ## $ GunOwnership : num 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 ...
# Filled map: one polygon per `group`, outlined in black and shaded by
# GunOwnership on a black-to-green gradient shown as a discrete legend.
ggplot(murderMap) +
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = GunOwnership),
    colour = "black"
  ) +
  scale_fill_gradient(low = "black", high = "green", guide = "legend")
# Inspect the structure of mtcars (no load step: it ships with R's
# datasets package) before fitting the regressions on mpg below.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables: ## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... ## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... ## $ disp: num 160 160 108 258 360 ... ## $ hp : num 110 110 93 110 175 105 245 62 95 123 ... ## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ... ## $ wt : num 2.62 2.88 2.32 3.21 3.44 ... ## $ qsec: num 16.5 17 18.6 19.4 17 ... ## $ vs : num 0 0 1 1 0 1 0 1 1 1 ... ## $ am : num 1 1 1 0 0 0 0 0 0 0 ... ## $ gear: num 4 4 4 3 3 3 3 4 4 4 ... ## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Simple linear regression of mpg on the 0/1 column am alone; summary()
# prints the coefficients, residual quantiles and R-squared.
summary(lm(mpg~am, data=mtcars))
## ## Call: ## lm(formula = mpg ~ am, data = mtcars) ## ## Residuals: ## Min 1Q Median 3Q Max ## -9.3923 -3.0923 -0.2974 3.2439 9.5077 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 17.147 1.125 15.247 1.13e-15 *** ## am 7.245 1.764 4.106 0.000285 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 4.902 on 30 degrees of freedom ## Multiple R-squared: 0.3598, Adjusted R-squared: 0.3385 ## F-statistic: 16.86 on 1 and 30 DF, p-value: 0.000285
# Full model: `.` in the formula expands to every other column of mtcars,
# so mpg is regressed on all ten remaining variables.
summary(lm(mpg~., data=mtcars))
## ## Call: ## lm(formula = mpg ~ ., data = mtcars) ## ## Residuals: ## Min 1Q Median 3Q Max ## -3.4506 -1.6044 -0.1196 1.2193 4.6271 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 12.30337 18.71788 0.657 0.5181 ## cyl -0.11144 1.04502 -0.107 0.9161 ## disp 0.01334 0.01786 0.747 0.4635 ## hp -0.02148 0.02177 -0.987 0.3350 ## drat 0.78711 1.63537 0.481 0.6353 ## wt -3.71530 1.89441 -1.961 0.0633 . ## qsec 0.82104 0.73084 1.123 0.2739 ## vs 0.31776 2.10451 0.151 0.8814 ## am 2.52023 2.05665 1.225 0.2340 ## gear 0.65541 1.49326 0.439 0.6652 ## carb -0.19942 0.82875 -0.241 0.8122 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 2.65 on 21 degrees of freedom ## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066 ## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
# Reduced model: mpg regressed on am, wt, hp and qsec only.
summary(lm(mpg~am+wt+hp+qsec, data=mtcars))
## ## Call: ## lm(formula = mpg ~ am + wt + hp + qsec, data = mtcars) ## ## Residuals: ## Min 1Q Median 3Q Max ## -3.4975 -1.5902 -0.1122 1.1795 4.5404 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 17.44019 9.31887 1.871 0.07215 . ## am 2.92550 1.39715 2.094 0.04579 * ## wt -3.23810 0.88990 -3.639 0.00114 ** ## hp -0.01765 0.01415 -1.247 0.22309 ## qsec 0.81060 0.43887 1.847 0.07573 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 2.435 on 27 degrees of freedom ## Multiple R-squared: 0.8579, Adjusted R-squared: 0.8368 ## F-statistic: 40.74 on 4 and 27 DF, p-value: 4.589e-11
Google uses R to make online advertisement more effective.
Facebook uses R for exploratory data analysis, user behaviour analysis related to profile updates and profile pictures.
Twitter uses R for data visualization and semantic clustering.
The Australia and New Zealand Banking Group (ANZ) uses R for credit risk analysis.
Bank of America uses R for reporting.
Trulia, the real estate company, uses R for predicting house prices.
The New York Times uses R for data visualization, election forecasts, and its dialect quiz.