# Install R and RStudio - interface
# NOTE: rm(list = ls()) only removes the (non-hidden) objects in the current
# environment; attached packages and options() are left untouched.
rm(list = ls()) # clear all workspace variables
getwd() # print the current working directory
## [1] "/Users/sangalp/WorkDocs/Learning/R/shiny_flightnetwork"
# NOTE: this path is specific to the author's machine; point it at a folder
# that exists on your computer before running the rest of the script.
setwd("~/WorkDocs/Learning/R")
4+5 # R can be used as a calculator
## [1] 9
# define variables
hours_per_day <- 24
str(hours_per_day)
## num 24
typeof(hours_per_day)
## [1] "double"
days_per_week <- 7
hours_per_week <- hours_per_day * days_per_week
hours_per_week
## [1] 168
# exercise: define variables for minutes per hour and seconds per minute,
# then compute the number of seconds in a day
# data structure: vector
day <- c(1, 2, 3, 4, 5, 6, 7)
day
## [1] 1 2 3 4 5 6 7
day <- 1:7 # the colon operator is easier for a long series
day
## [1] 1 2 3 4 5 6 7
cumulative_hours <- hours_per_day * day # multiplication is element-wise
cumulative_hours
## [1] 24 48 72 96 120 144 168
# data structure: list
# A list is a special kind of vector whose elements may have different classes.
# Try building the same thing with c() and see what happens.
x <- list(1, 2, 3, 4, 5, 6, "7")
x
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 3
##
## [[4]]
## [1] 4
##
## [[5]]
## [1] 5
##
## [[6]]
## [1] 6
##
## [[7]]
## [1] "7"
# An element of a list can itself be a vector.
x <- list(1:10, "1", c(TRUE, FALSE))
x
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10
##
## [[2]]
## [1] "1"
##
## [[3]]
## [1] TRUE FALSE
# factors and tables
x <- c(1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5)
factor(x) # categorize the values using levels
## [1] 1 1 1 1 2 2 3 3 3 3 3 3 3 4 4 5
## Levels: 1 2 3 4 5
table(x) # count how often each value occurs
## x
## 1 2 3 4 5
## 4 2 7 2 1
# compare variables
x <- y <- 1
x < y
## [1] FALSE
day1 <- c(2, 3, 3, 5, 6, 7, 8)
day2 <- c(1, 2, 4, 4, 5, 6, 7)
day1 < day2 # the comparison is done element by element
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# data structure: a data frame is two-dimensional, a vector is one-dimensional
df <- data.frame(
  age = c(10, 20, 45, 37),
  name = c("John", "Mary", "Jane", "Ana")
)
df
## age name
## 1 10 John
## 2 20 Mary
## 3 45 Jane
## 4 37 Ana
df$name # `$` extracts a single column as a vector
## [1] John Mary Jane Ana
## Levels: Ana Jane John Mary
# exercise: create a data frame with
# two columns: city, population and
# five rows: fill imaginary values
df_city <- data.frame(
  city = c("Budapest", "Seattle", "Anacortes", "Gyor", "Bellingham"),
  population = c(1500000, 2000000, 16000, 150000, 150000)
)
df_city
## city population
## 1 Budapest 1500000
## 2 Seattle 2000000
## 3 Anacortes 16000
## 4 Gyor 150000
## 5 Bellingham 150000
# matrix: a two-dimensional vector whose elements all share the same class
x <- 1:3
y <- 10:12
m <- cbind(x, y) # bind the two vectors together as columns
m
## x y
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12
# data frames are used to store table-like data; unlike a matrix, each column
# of a data frame can have a different class.
# functions
# intro
# Return the sum of `a` and `b`.
addnumbers <- function(a, b) {
  a + b
}
addnumbers(2, 3)
## [1] 5
# another
# Print how many characters `firstname` has, followed by an extra remark when
# the name is at least 10 characters long.
namefunction <- function(firstname) {
  n_letters <- nchar(firstname)
  letter_msg <- paste0("Number of letters in my first name is: ", n_letters)
  print(letter_msg)
  if (n_letters >= 10) {
    print("I have a long first name")
  }
}
namefunction("Jonathan")
## [1] "Number of letters in my first name is: 8"
namefunction("Alexandria")
## [1] "Number of letters in my first name is: 10"
## [1] "I have a long first name"
# exercise: create a function that takes a vector of numbers and returns their mean
# Return the arithmetic mean of the numeric vector `nums`.
meanfunction <- function(nums) {
  average <- mean(nums)
  average
}
meanfunction(day)
## [1] 4
# reading files
# list.files() treats `pattern` as a regular expression, so anchor it to the
# ".csv" extension instead of matching "csv" anywhere in a file name.
list.files(pattern = "\\.csv$")
## [1] "130912_P5_3PE_details.csv" "test.csv"
## [3] "train.csv"
# read.csv() returns a data frame; the path below is specific to the author's
# machine, so adjust it to wherever airports.csv lives on your computer.
airports <- read.csv("~/WorkDocs/Learning/R/shiny_flightnetwork/airports.csv")
head(airports) # show the first six rows
## Airport Latitude Longitude Name City
## 1 GKA -6.081689 145.3919 Goroka Goroka
## 2 MAG -5.207083 145.7887 Madang Madang
## 3 HGU -5.826789 144.2959 Mount Hagen Mount Hagen
## 4 LAE -6.569828 146.7262 Nadzab Nadzab
## 5 POM -9.443383 147.2200 Port Moresby Jacksons Intl Port Moresby
## 6 WWK -3.583828 143.6692 Wewak Intl Wewak
## Country
## 1 Papua New Guinea
## 2 Papua New Guinea
## 3 Papua New Guinea
## 4 Papua New Guinea
## 5 Papua New Guinea
## 6 Papua New Guinea
names(airports) # column names
## [1] "Airport" "Latitude" "Longitude" "Name" "City" "Country"
length(airports) # for a data frame, length() is the number of columns
## [1] 6
# data from packages
# mtcars is an example data set that ships with R (datasets package), so it is
# available without reading any file.
head(mtcars) # first six rows
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Per-column summary: minimum, quartiles, median, mean and maximum.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# The pkg::fun() form calls describe() from the psych package without
# attaching it; the package must be installed for this line to run.
psych::describe(mtcars)
## vars n mean sd median trimmed mad min max range skew
## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61
## cyl 2 32 6.19 1.79 6.00 6.23 2.97 4.00 8.00 4.00 -0.17
## disp 3 32 230.72 123.94 196.30 222.52 140.48 71.10 472.00 400.90 0.38
## hp 4 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73
## drat 5 32 3.60 0.53 3.70 3.58 0.70 2.76 4.93 2.17 0.27
## wt 6 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42
## qsec 7 32 17.85 1.79 17.71 17.83 1.42 14.50 22.90 8.40 0.37
## vs 8 32 0.44 0.50 0.00 0.42 0.00 0.00 1.00 1.00 0.24
## am 9 32 0.41 0.50 0.00 0.38 0.00 0.00 1.00 1.00 0.36
## gear 10 32 3.69 0.74 4.00 3.62 1.48 3.00 5.00 2.00 0.53
## carb 11 32 2.81 1.62 2.00 2.65 1.48 1.00 8.00 7.00 1.05
## kurtosis se
## mpg -0.37 1.07
## cyl -1.76 0.32
## disp -1.21 21.91
## hp -0.14 12.12
## drat -0.71 0.09
## wt -0.02 0.17
## qsec 0.34 0.32
## vs -2.00 0.09
## am -1.92 0.09
## gear -1.07 0.13
## carb 1.26 0.29
# Hmisc also has a function called describe(); the Hmisc:: prefix makes it
# explicit which package's version is used. Hmisc must be installed.
Hmisc::describe(mtcars)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## mtcars
##
## 11 Variables 32 Observations
## ---------------------------------------------------------------------------
## mpg
## n missing distinct Info Mean Gmd .05 .10
## 32 0 25 0.999 20.09 6.796 12.00 14.34
## .25 .50 .75 .90 .95
## 15.43 19.20 22.80 30.09 31.30
##
## lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
## ---------------------------------------------------------------------------
## cyl
## n missing distinct Info Mean Gmd
## 32 0 3 0.866 6.188 1.948
##
## Value 4 6 8
## Frequency 11 7 14
## Proportion 0.344 0.219 0.438
## ---------------------------------------------------------------------------
## disp
## n missing distinct Info Mean Gmd .05 .10
## 32 0 27 0.999 230.7 142.5 77.35 80.61
## .25 .50 .75 .90 .95
## 120.83 196.30 326.00 396.00 449.00
##
## lowest : 71.1 75.7 78.7 79.0 95.1, highest: 360.0 400.0 440.0 460.0 472.0
## ---------------------------------------------------------------------------
## hp
## n missing distinct Info Mean Gmd .05 .10
## 32 0 22 0.997 146.7 77.04 63.65 66.00
## .25 .50 .75 .90 .95
## 96.50 123.00 180.00 243.50 253.55
##
## lowest : 52 62 65 66 91, highest: 215 230 245 264 335
## ---------------------------------------------------------------------------
## drat
## n missing distinct Info Mean Gmd .05 .10
## 32 0 22 0.997 3.597 0.6099 2.853 3.007
## .25 .50 .75 .90 .95
## 3.080 3.695 3.920 4.209 4.314
##
## lowest : 2.76 2.93 3.00 3.07 3.08, highest: 4.08 4.11 4.22 4.43 4.93
## ---------------------------------------------------------------------------
## wt
## n missing distinct Info Mean Gmd .05 .10
## 32 0 29 0.999 3.217 1.089 1.736 1.956
## .25 .50 .75 .90 .95
## 2.581 3.325 3.610 4.048 5.293
##
## lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
## ---------------------------------------------------------------------------
## qsec
## n missing distinct Info Mean Gmd .05 .10
## 32 0 30 1 17.85 2.009 15.05 15.53
## .25 .50 .75 .90 .95
## 16.89 17.71 18.90 19.99 20.10
##
## lowest : 14.50 14.60 15.41 15.50 15.84, highest: 19.90 20.00 20.01 20.22 22.90
## ---------------------------------------------------------------------------
## vs
## n missing distinct Info Sum Mean Gmd
## 32 0 2 0.739 14 0.4375 0.5081
##
## ---------------------------------------------------------------------------
## am
## n missing distinct Info Sum Mean Gmd
## 32 0 2 0.724 13 0.4062 0.498
##
## ---------------------------------------------------------------------------
## gear
## n missing distinct Info Mean Gmd
## 32 0 3 0.841 3.688 0.7863
##
## Value 3 4 5
## Frequency 15 12 5
## Proportion 0.469 0.375 0.156
## ---------------------------------------------------------------------------
## carb
## n missing distinct Info Mean Gmd
## 32 0 6 0.929 2.812 1.718
##
## Value 1 2 3 4 6 8
## Frequency 7 10 3 10 1 1
## Proportion 0.219 0.312 0.094 0.312 0.031 0.031
## ---------------------------------------------------------------------------
# Plots
# Best practices from Tufte's minimalist design:
# keep a consistent look and feel, label the data, and show missing data rather than hide it;
# rely on the reasoning in the header rather than overwhelming the user's cognitive reasoning
# Colors: use for categorical or discrete variables, not continuous.
# Colors: use those that are easily perceptible by the human eye.
# Use shapes and angles
library(ggplot2)
head(diamonds)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# xy coordinates only: no geom layer is added, so just the axes are set up
ggplot(diamonds, aes(x = carat, y = price))

# scatter plot
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()

# histogram of carat with the default binning
ggplot(diamonds, aes(x = carat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# histogram with a fill color
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(fill = "steelblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# scatter plot with custom x and y axis labels
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point() +
  scale_x_continuous("Diamond Carat") +
  scale_y_continuous("Diamond Price in USD")

# explore many other plots on your own
# explore other Libraries: see ggplot2 and ggvis cheatsheets, googleVis demo
# WORLD MAP see http://rpubs.com/pmsangal/maps Choropleth simply uses colors, cartogram uses size of
# country
# dplyr for data wrangling
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# select() keeps only the named columns, much like SELECT in SQL
mtcars %>%
  select(mpg, cyl)
## mpg cyl
## Mazda RX4 21.0 6
## Mazda RX4 Wag 21.0 6
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 6
## Hornet Sportabout 18.7 8
## Valiant 18.1 6
## Duster 360 14.3 8
## Merc 240D 24.4 4
## Merc 230 22.8 4
## Merc 280 19.2 6
## Merc 280C 17.8 6
## Merc 450SE 16.4 8
## Merc 450SL 17.3 8
## Merc 450SLC 15.2 8
## Cadillac Fleetwood 10.4 8
## Lincoln Continental 10.4 8
## Chrysler Imperial 14.7 8
## Fiat 128 32.4 4
## Honda Civic 30.4 4
## Toyota Corolla 33.9 4
## Toyota Corona 21.5 4
## Dodge Challenger 15.5 8
## AMC Javelin 15.2 8
## Camaro Z28 13.3 8
## Pontiac Firebird 19.2 8
## Fiat X1-9 27.3 4
## Porsche 914-2 26.0 4
## Lotus Europa 30.4 4
## Ford Pantera L 15.8 8
## Ferrari Dino 19.7 6
## Maserati Bora 15.0 8
## Volvo 142E 21.4 4
# filter() keeps only the rows for which the condition is TRUE
mtcars %>%
  filter(cyl > 4)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 7 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 8 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 9 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 10 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 11 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 12 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 13 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 14 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 15 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 16 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 17 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 18 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 19 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 20 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 21 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
# verbs can be chained: pick the columns first, then keep the matching rows
mtcars %>%
  select(mpg, cyl) %>%
  filter(cyl > 4)
## mpg cyl
## 1 21.0 6
## 2 21.0 6
## 3 21.4 6
## 4 18.7 8
## 5 18.1 6
## 6 14.3 8
## 7 19.2 6
## 8 17.8 6
## 9 16.4 8
## 10 17.3 8
## 11 15.2 8
## 12 10.4 8
## 13 10.4 8
## 14 14.7 8
## 15 15.5 8
## 16 15.2 8
## 17 13.3 8
## 18 19.2 8
## 19 15.8 8
## 20 19.7 6
## 21 15.0 8
unique(diamonds$cut)
## [1] Ideal Premium Good Very Good Fair
## Levels: Fair < Good < Very Good < Premium < Ideal
# refer to dplyr cheatsheet, several others available https://www.rstudio.com/resources/cheatsheets/
# exercise: sort mtcars by mpg then hp using arrange function
# Two small example tables sharing the key column x1: "C" appears only in `a`
# and "D" only in `b`. Spell out FALSE rather than F, because F is an ordinary
# variable that can be reassigned.
a <- data.frame(
  x1 = c("A", "B", "C"),
  x2 = c("one", "two", "three"),
  stringsAsFactors = FALSE
)
b <- data.frame(
  x1 = c("A", "B", "D"),
  x2 = c("uno", "dos", "cuatro"),
  stringsAsFactors = FALSE
)
left_join(a, b, by = "x1") # keep every row of a; add matching columns from b
## x1 x2.x x2.y
## 1 A one uno
## 2 B two dos
## 3 C three <NA>
right_join(a, b, by = "x1") # keep every row of b; add matching columns from a
## x1 x2.x x2.y
## 1 A one uno
## 2 B two dos
## 3 D <NA> cuatro
inner_join(a, b, by = "x1") # keep only rows whose x1 appears in both a and b
## x1 x2.x x2.y
## 1 A one uno
## 2 B two dos
full_join(a, b, by = "x1") # keep all rows from both a and b
## x1 x2.x x2.y
## 1 A one uno
## 2 B two dos
## 3 C three <NA>
## 4 D <NA> cuatro
union(a,b) # all distinct rows found in a or b
## x1 x2
## 1 A one
## 2 B two
## 3 C three
## 4 A uno
## 5 B dos
## 6 D cuatro