lesson1.utf8.md

# Install R and RStudio - interface

rm(list = ls())  # clear all workspace variables
getwd()

## [1] "/Users/sangalp/WorkDocs/Learning/R/shiny_flightnetwork"

setwd("~/WorkDocs/Learning/R")
4+5

## [1] 9

# define variables
hours_per_day = 24
str(hours_per_day)

##  num 24

typeof(hours_per_day)

## [1] "double"

days_per_week = 7
hours_per_week = hours_per_day * days_per_week
hours_per_week

## [1] 168

# exercise: define variables for minutes/hr, sec/min, and find #sec in a day

# data structure: vector
day = c(1, 2, 3, 4, 5, 6, 7) 
day

## [1] 1 2 3 4 5 6 7

day = (1:7) #easier for a long series
day

## [1] 1 2 3 4 5 6 7

cumulative_hours = hours_per_day * day
cumulative_hours

## [1]  24  48  72  96 120 144 168

# data structure: list
# lists
x = list(1, 2, 3, 4, 5, 6, '7') # a list is a special vector that can contain different classes. try doing that with a vector and see what happens
x

## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3
## 
## [[4]]
## [1] 4
## 
## [[5]]
## [1] 5
## 
## [[6]]
## [1] 6
## 
## [[7]]
## [1] "7"

x = list(1:10, '1', c(TRUE,FALSE)) # can have vector itself as an element.
x

## [[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## [[2]]
## [1] "1"
## 
## [[3]]
## [1]  TRUE FALSE

# factors and tables
x = c(1,1,1,1,2,2,3,3,3,3,3,3,3,4,4,5) # categorize using levels
factor(x)

##  [1] 1 1 1 1 2 2 3 3 3 3 3 3 3 4 4 5
## Levels: 1 2 3 4 5

table(x)

## x
## 1 2 3 4 5 
## 4 2 7 2 1

# compare variables
x=y=1
x<y

## [1] FALSE

day1 = c(2, 3, 3, 5, 6, 7, 8) 
day2 = c(1, 2, 4, 4, 5, 6, 7)
day1 < day2

## [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

# data structure: data frame is two dim, vector is one dim
df = data.frame(age = c(10, 20, 45, 37), name = c("John", "Mary", "Jane", "Ana"))
df

##   age name
## 1  10 John
## 2  20 Mary
## 3  45 Jane
## 4  37  Ana

df$name

## [1] John Mary Jane Ana 
## Levels: Ana Jane John Mary

# exercise: create a data frame with 
# two columns: city, population and 
# five rows: fill imaginary values
# one row per city: its name and a made-up population figure
df_city = data.frame(
  city = c("Budapest", "Seattle", "Anacortes", "Gyor", "Bellingham"),
  population = c(1500000, 2000000, 16000, 150000, 150000)
)
df_city

##         city population
## 1   Budapest    1500000
## 2    Seattle    2000000
## 3  Anacortes      16000
## 4       Gyor     150000
## 5 Bellingham     150000

# matrix
# matrix: a vector with two dimensions whose elements all share one class;
# cbind() places x and y side by side as the columns
x = 1:3
y = 10:12
m = cbind(x, y)
m

##      x  y
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12

# data frames, used to store table like data. unlike matrix, it can have different classes per column. 

# functions
# intro
# Return the sum of a and b (the last expression is the function's value).
addnumbers = function(a, b) {
  a + b
}
addnumbers(2,3)

## [1] 5

# another
# Print how many characters `firstname` has, then print an extra remark
# when it is 10 characters or longer.
namefunction = function(firstname) {
  n_chars = nchar(firstname)
  print(paste0("Number of letters in my first name is: ", n_chars))
  is_long = n_chars >= 10
  if (is_long) {
    print("I have a long first name")
  }
}
namefunction("Jonathan")

## [1] "Number of letters in my first name is: 8"

namefunction("Alexandria")

## [1] "Number of letters in my first name is: 10"
## [1] "I have a long first name"

# exercise: create a function that takes a vector of numbers and returns their mean
# Return the mean of the values in `nums`, as computed by base mean().
meanfunction = function(nums) mean(nums)
meanfunction(day)

## [1] 4

# reading files
# `pattern` is a regular expression: anchor it so only names ending in ".csv"
# match, rather than any name that merely contains "csv"
(list.files(pattern = "\\.csv$"))

## [1] "130912_P5_3PE_details.csv" "test.csv"                 
## [3] "train.csv"

airports = read.csv("~/WorkDocs/Learning/R/shiny_flightnetwork/airports.csv")
head(airports) # for better head

##   Airport  Latitude Longitude                       Name         City
## 1     GKA -6.081689  145.3919                     Goroka       Goroka
## 2     MAG -5.207083  145.7887                     Madang       Madang
## 3     HGU -5.826789  144.2959                Mount Hagen  Mount Hagen
## 4     LAE -6.569828  146.7262                     Nadzab       Nadzab
## 5     POM -9.443383  147.2200 Port Moresby Jacksons Intl Port Moresby
## 6     WWK -3.583828  143.6692                 Wewak Intl        Wewak
##            Country
## 1 Papua New Guinea
## 2 Papua New Guinea
## 3 Papua New Guinea
## 4 Papua New Guinea
## 5 Papua New Guinea
## 6 Papua New Guinea

names(airports)

## [1] "Airport"   "Latitude"  "Longitude" "Name"      "City"      "Country"

length(airports) # for a data frame this is the number of columns (see names() above), not rows

## [1] 6

# data from packages
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

summary(mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

psych::describe(mtcars)

##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29

Hmisc::describe(mtcars)

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## mtcars 
## 
##  11  Variables      32  Observations
## ---------------------------------------------------------------------------
## mpg 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       32        0       25    0.999    20.09    6.796    12.00    14.34 
##      .25      .50      .75      .90      .95 
##    15.43    19.20    22.80    30.09    31.30 
## 
## lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
## ---------------------------------------------------------------------------
## cyl 
##        n  missing distinct     Info     Mean      Gmd 
##       32        0        3    0.866    6.188    1.948 
##                             
## Value          4     6     8
## Frequency     11     7    14
## Proportion 0.344 0.219 0.438
## ---------------------------------------------------------------------------
## disp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       32        0       27    0.999    230.7    142.5    77.35    80.61 
##      .25      .50      .75      .90      .95 
##   120.83   196.30   326.00   396.00   449.00 
## 
## lowest :  71.1  75.7  78.7  79.0  95.1, highest: 360.0 400.0 440.0 460.0 472.0
## ---------------------------------------------------------------------------
## hp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       32        0       22    0.997    146.7    77.04    63.65    66.00 
##      .25      .50      .75      .90      .95 
##    96.50   123.00   180.00   243.50   253.55 
## 
## lowest :  52  62  65  66  91, highest: 215 230 245 264 335
## ---------------------------------------------------------------------------
## drat 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       32        0       22    0.997    3.597   0.6099    2.853    3.007 
##      .25      .50      .75      .90      .95 
##    3.080    3.695    3.920    4.209    4.314 
## 
## lowest : 2.76 2.93 3.00 3.07 3.08, highest: 4.08 4.11 4.22 4.43 4.93
## ---------------------------------------------------------------------------
## wt 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       32        0       29    0.999    3.217    1.089    1.736    1.956 
##      .25      .50      .75      .90      .95 
##    2.581    3.325    3.610    4.048    5.293 
## 
## lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
## ---------------------------------------------------------------------------
## qsec 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       32        0       30        1    17.85    2.009    15.05    15.53 
##      .25      .50      .75      .90      .95 
##    16.89    17.71    18.90    19.99    20.10 
## 
## lowest : 14.50 14.60 15.41 15.50 15.84, highest: 19.90 20.00 20.01 20.22 22.90
## ---------------------------------------------------------------------------
## vs 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##       32        0        2    0.739       14   0.4375   0.5081 
## 
## ---------------------------------------------------------------------------
## am 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##       32        0        2    0.724       13   0.4062    0.498 
## 
## ---------------------------------------------------------------------------
## gear 
##        n  missing distinct     Info     Mean      Gmd 
##       32        0        3    0.841    3.688   0.7863 
##                             
## Value          3     4     5
## Frequency     15    12     5
## Proportion 0.469 0.375 0.156
## ---------------------------------------------------------------------------
## carb 
##        n  missing distinct     Info     Mean      Gmd 
##       32        0        6    0.929    2.812    1.718 
##                                               
## Value          1     2     3     4     6     8
## Frequency      7    10     3    10     1     1
## Proportion 0.219 0.312 0.094 0.312 0.031 0.031
## ---------------------------------------------------------------------------

# Plots
# Best practices from Tufte minimalistic design:
# consistent look n feel, label the data, better to show missing data than hide, 
# rely on reasoning of the header rather than overwhelm the user's cognitive reasoning
# Colors: use for categorical or discrete variables, not continuous.
# Colors: use those that are easily perceptible by the human eye.  
# Use shapes and angles
library(ggplot2)
head(diamonds)

## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

ggplot(data=diamonds, aes(x=carat, y=price)) #xy coordinates

ggplot(data=diamonds, aes(x=carat, y=price)) + geom_point() #scatter plot

ggplot(data=diamonds, aes(x=carat)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=diamonds, aes(x=carat)) + geom_histogram(fill="steelblue") #plot with color

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(diamonds, aes(x = carat, y = price)) + geom_point() + scale_x_continuous("Diamond Carat") + scale_y_continuous("Diamond Price in USD") # xy custom labels

# explore many other plots on your own

# explore other Libraries: see ggplot2 and ggvis cheatsheets, googleVis demo
# WORLD MAP see http://rpubs.com/pmsangal/maps Choropleth simply uses colors, cartogram uses size of
# country

# dplyr for data wrangling
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

mtcars %>% select(mpg,cyl) # sql like

##                      mpg cyl
## Mazda RX4           21.0   6
## Mazda RX4 Wag       21.0   6
## Datsun 710          22.8   4
## Hornet 4 Drive      21.4   6
## Hornet Sportabout   18.7   8
## Valiant             18.1   6
## Duster 360          14.3   8
## Merc 240D           24.4   4
## Merc 230            22.8   4
## Merc 280            19.2   6
## Merc 280C           17.8   6
## Merc 450SE          16.4   8
## Merc 450SL          17.3   8
## Merc 450SLC         15.2   8
## Cadillac Fleetwood  10.4   8
## Lincoln Continental 10.4   8
## Chrysler Imperial   14.7   8
## Fiat 128            32.4   4
## Honda Civic         30.4   4
## Toyota Corolla      33.9   4
## Toyota Corona       21.5   4
## Dodge Challenger    15.5   8
## AMC Javelin         15.2   8
## Camaro Z28          13.3   8
## Pontiac Firebird    19.2   8
## Fiat X1-9           27.3   4
## Porsche 914-2       26.0   4
## Lotus Europa        30.4   4
## Ford Pantera L      15.8   8
## Ferrari Dino        19.7   6
## Maserati Bora       15.0   8
## Volvo 142E          21.4   4

mtcars %>% filter(cyl > 4)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 5  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 6  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 7  19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 8  17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 9  16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 10 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 11 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 12 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 13 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 14 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 15 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 16 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 17 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 18 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 19 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 20 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 21 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8

mtcars %>% select(mpg,cyl) %>% filter(cyl > 4)

##     mpg cyl
## 1  21.0   6
## 2  21.0   6
## 3  21.4   6
## 4  18.7   8
## 5  18.1   6
## 6  14.3   8
## 7  19.2   6
## 8  17.8   6
## 9  16.4   8
## 10 17.3   8
## 11 15.2   8
## 12 10.4   8
## 13 10.4   8
## 14 14.7   8
## 15 15.5   8
## 16 15.2   8
## 17 13.3   8
## 18 19.2   8
## 19 15.8   8
## 20 19.7   6
## 21 15.0   8

unique(diamonds$cut)

## [1] Ideal     Premium   Good      Very Good Fair     
## Levels: Fair < Good < Very Good < Premium < Ideal

# refer to dplyr cheatsheet, several others available https://www.rstudio.com/resources/cheatsheets/ 
# exercise: sort mtcars by mpg then hp using arrange function

# two small tables sharing the key column x1; strings are kept as character
# (not factor). FALSE is spelled out because F is an ordinary variable that
# can be reassigned.
a = data.frame(
  x1 = c("A", "B", "C"),
  x2 = c("one", "two", "three"),
  stringsAsFactors = FALSE
)
b = data.frame(
  x1 = c("A", "B", "D"),
  x2 = c("uno", "dos", "cuatro"),
  stringsAsFactors = FALSE
)
left_join(a, b, by = "x1") # keep every row of a; add matching values from b

##   x1  x2.x x2.y
## 1  A   one  uno
## 2  B   two  dos
## 3  C three <NA>

right_join(a, b, by = "x1") # keep every row of b; add matching values from a

##   x1 x2.x   x2.y
## 1  A  one    uno
## 2  B  two    dos
## 3  D <NA> cuatro

inner_join(a, b, by = "x1") # join matching rows from both

##   x1 x2.x x2.y
## 1  A  one  uno
## 2  B  two  dos

full_join(a, b, by = "x1") # join all rows from both

##   x1  x2.x   x2.y
## 1  A   one    uno
## 2  B   two    dos
## 3  C three   <NA>
## 4  D  <NA> cuatro

union(a,b) # unique rows from all

##   x1     x2
## 1  A    one
## 2  B    two
## 3  C  three
## 4  A    uno
## 5  B    dos
## 6  D cuatro

lesson1.R

sangalp

2019-05-15