Sally Chen
8/25/2017
getwd() # get working directory## [1] "/Users/sallychen/Desktop/Introduction to R/intror"
summary(cars)## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
Click Import.
This is equivalent to running the following code:
library(readr)
movie_top5000 <- read_csv("~/Desktop/Introduction to R/intror/movie_top5000.csv")## Parsed with column specification:
## cols(
## Rank = col_integer(),
## imdb = col_character(),
## Name = col_character(),
## Year = col_integer(),
## Rating = col_double(),
## Length = col_integer(),
## Genres = col_character(),
## MPAA = col_character(),
## Release = col_character(),
## Gross_Box = col_double()
## )
class(movie_top5000) #if you are not sure, check the class## [1] "tbl_df" "tbl" "data.frame"
1+1;## [1] 2
log10(10);## [1] 1
exp(2);## [1] 7.389056
# Build a 5 x 4 matrix filled row by row, print it, and sum each row.
# A 5 x 4 matrix holds 20 values, so only the first 20 elements of `a` are
# passed in; handing matrix() all 100 relies on the surplus being silently
# discarded (and triggers a length-mismatch warning in newer R versions).
a <- 1:100
m <- matrix(a[1:20], nrow = 5, ncol = 4, byrow = TRUE)
print(m)
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 5 6 7 8
## [3,] 9 10 11 12
## [4,] 13 14 15 16
## [5,] 17 18 19 20
# MARGIN = 1 applies the function over rows (2 would be columns).
apply(m, MARGIN = 1, FUN = sum)
## [1] 10 26 42 58 74
colnames(movie_top5000);#see what columns a dataset has## [1] "Rank" "imdb" "Name" "Year" "Rating"
## [6] "Length" "Genres" "MPAA" "Release" "Gross_Box"
summary(movie_top5000);# see some summary statistics## Rank imdb Name Year
## Min. : 1 Length:5000 Length:5000 Min. :1995
## 1st Qu.:1251 Class :character Class :character 1st Qu.:2000
## Median :2500 Mode :character Mode :character Median :2005
## Mean :2500 Mean :2005
## 3rd Qu.:3750 3rd Qu.:2009
## Max. :5000 Max. :2014
##
## Rating Length Genres MPAA
## Min. :1.700 Min. : 45.0 Length:5000 Length:5000
## 1st Qu.:5.800 1st Qu.: 95.0 Class :character Class :character
## Median :6.500 Median :105.0 Mode :character Mode :character
## Mean :6.378 Mean :108.2
## 3rd Qu.:7.100 3rd Qu.:117.0
## Max. :9.000 Max. :383.0
## NA's :35
## Release Gross_Box
## Length:5000 Min. : 189000
## Class :character 1st Qu.: 1190000
## Mode :character Median : 11600000
## Mean : 34184385
## 3rd Qu.: 41300000
## Max. :761000000
##
# Each plot() call receives a single numeric vector, so the values are drawn
# against their position in the data set. The statements are kept on separate
# lines so that none of them ends up inside the previous line's comment.
plot(movie_top5000$Rating) # point plot of rating
plot(movie_top5000$Gross_Box) # point plot of gross box
cor(movie_top5000$Gross_Box, movie_top5000$Rating) # see the correlation
## [1] 0.1315947
help("seq")
help("rnorm")# define a sum function
# Return the sum of the two arguments.
simplesum <- function(a, b) {
  total <- a + b
  total
}
# Return the difference a - b of the two arguments.
# The same definition used to appear twice back to back; the second copy only
# overwrote the first with identical code, so a single definition is kept.
simpledif <- function(a, b) {
  a - b
}
# A function may take no arguments at all.
# Sally() draws one standard-normal random number and prints one of two
# messages depending on its sign, so repeated calls can print different
# messages unless the random seed is fixed first.
Sally <- function() {
  a <- rnorm(1)
  if (a > 0) {
    print("I'm really tired,see you tomorrow")
  } else {
    print("Shall we have a cup of coffee?")
  }
}

# The call below used to be glued onto the closing brace of Sally(), which
# does not parse; it is a separate statement.
simplesum(100, 200)
## [1] 300
Sally()## [1] "Shall we have a cup of coffee?"
Sally()## [1] "Shall we have a cup of coffee?"
movie_year <- table(movie_top5000$Year) # count how many movies in each year

# create a separate movie dataset containing only the numeric columns
movie_top5000_1 <- movie_top5000[, c("Rank", "Year", "Rating", "Length", "Gross_Box")]

# correlation analysis; use = "complete.obs" drops rows with missing values
movie_correlation <- cor(movie_top5000_1, use = "complete.obs")

# Packages used in the rest of the session. Only the ones that are not yet
# available are installed, so re-running the script does not reinstall
# everything each time.
mypackages <- c("dplyr", "data.table", "lubridate", "nycflights13", "ggplot2")
missing_packages <- mypackages[
  !vapply(mypackages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(dplyr)
library(ggplot2)
library(lubridate)
library(nycflights13)

# Copy the data set provided by nycflights13 into the workspace; later steps
# assign new columns to `flights`.
flights <- flights
head(flights) # use head() to see the top rows of the data## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
dim(flights) # use dim() to see the dimensions of the data## [1] 336776 19
filter(flights, month == 1, day == 1)## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights,dep_delay) # ascending order by default## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43 40
## 2 2013 2 3 2022 2055 -33 2240
## 3 2013 11 10 1408 1440 -32 1549
## 4 2013 1 11 1900 1930 -30 2233
## 5 2013 1 29 1703 1730 -27 1947
## 6 2013 8 9 729 755 -26 1002
## 7 2013 10 23 1907 1932 -25 2143
## 8 2013 3 30 2030 2055 -25 2213
## 9 2013 3 2 1431 1455 -24 1601
## 10 2013 5 5 934 958 -24 1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights,desc(dep_delay)) # use desc() to specify descending order## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights,month,day,desc(dep_delay)) #ordering by multiple attributes## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 848 1835 853 1001
## 2 2013 1 1 2343 1724 379 314
## 3 2013 1 1 1815 1325 290 2120
## 4 2013 1 1 2205 1720 285 46
## 5 2013 1 1 1842 1422 260 1958
## 6 2013 1 1 2115 1700 255 2330
## 7 2013 1 1 2006 1630 216 2230
## 8 2013 1 1 2312 2000 192 21
## 9 2013 1 1 1942 1705 157 2124
## 10 2013 1 1 1938 1703 155 2109
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
select(flights, year, month, day, carrier)## # A tibble: 336,776 × 4
## year month day carrier
## <int> <int> <int> <chr>
## 1 2013 1 1 UA
## 2 2013 1 1 UA
## 3 2013 1 1 AA
## 4 2013 1 1 B6
## 5 2013 1 1 DL
## 6 2013 1 1 UA
## 7 2013 1 1 B6
## 8 2013 1 1 EV
## 9 2013 1 1 B6
## 10 2013 1 1 AA
## # ... with 336,766 more rows
flights<-mutate(flights,
speed = distance / air_time * 60
) # mutate keep the original columns
select(flights, speed)## # A tibble: 336,776 × 1
## speed
## <dbl>
## 1 370.0441
## 2 374.2731
## 3 408.3750
## 4 516.7213
## 5 394.1379
## 6 287.6000
## 7 404.4304
## 8 259.2453
## 9 404.5714
## 10 318.6957
## # ... with 336,766 more rows
# Group the flights by their `origin` column, then compute one row of
# departure-delay statistics per group. na.rm = TRUE removes NA values
# before each statistic is calculated.
origins <- flights %>% group_by(origin)
delay_origin_summary <- origins %>%
  summarise(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    flights = n(), # n() counts the number of observations in the group
    sd_delay = sd(dep_delay, na.rm = TRUE),
    max_delay = max(dep_delay, na.rm = TRUE)
  )
delay_origin_summary## # A tibble: 3 × 5
## origin avg_delay flights sd_delay max_delay
## <chr> <dbl> <int> <dbl> <dbl>
## 1 EWR 15.10795 120835 41.32370 1126
## 2 JFK 12.11216 111279 39.03507 1301
## 3 LGA 10.34688 104662 39.99302 911
arrange(delay_origin_summary,desc(avg_delay)) # descending order by avg_delay time## # A tibble: 3 × 5
## origin avg_delay flights sd_delay max_delay
## <chr> <dbl> <int> <dbl> <dbl>
## 1 EWR 15.10795 120835 41.32370 1126
## 2 JFK 12.11216 111279 39.03507 1301
## 3 LGA 10.34688 104662 39.99302 911
now<-"2017-08-25 11:04:04"
class(now)## [1] "character"
now<-ymd_hms(now)
class(now)## [1] "POSIXct" "POSIXt"
print(now)## [1] "2017-08-25 11:04:04 UTC"
year(now)## [1] 2017
weekdays(now)## [1] "Friday"
flights$weekday<-weekdays(flights$time_hour)
table(flights$weekday)##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 50308 50690 38720 46357 50219 50422 50060
# plot the count of flights for each carrier
g <- ggplot(flights)
g + geom_bar(aes(carrier))

# plot the count of flights for each carrier, grouped by origin
g <- ggplot(flights)
g + geom_bar(aes(carrier, fill = origin)) # group the flights by origin for each carrier

# the same chart with a title
g + geom_bar(aes(carrier, fill = origin)) +
  labs(title = "Flights by Carriers & Airport in New York City")

# NOTE(review): the sort keys are kept exactly as written, but the last three
# (desc(Rating), desc(Gross_Box), Year) cannot change the result because rows
# are already fully ordered by Rating, Gross_Box, Year ascending. If a
# descending order was intended, the ascending keys should be dropped --
# TODO confirm.
ordered_movies <- arrange(
  movie_top5000,
  Rating, Gross_Box, Year, desc(Rating), desc(Gross_Box), Year
)

# number of movies and mean gross box office per year, drawn as a line
years_grouped <- group_by(movie_top5000, Year)
year_movie_summary <- summarise(
  years_grouped,
  count = n(),
  avg_gross_box = mean(Gross_Box, na.rm = TRUE)
)
ggplot(year_movie_summary) + geom_line(aes(Year, avg_gross_box))