library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(nycflights13)
# Load the flights table from the nycflights13 package and show its structure.
data("flights", package = "nycflights13")
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
#p<-function(flights){sum(is.na(flights))/length(flights)*100}#missing value percentages
#apply(flights,2,p)#no missing values
#bar graphs
# Bar chart: number of flights per calendar month.
# The plot gets its own name so `data` only ever holds the flights subset.
month_plot <- flights %>%
  select(flight, carrier, time_hour) %>%
  mutate(month = month(time_hour, label = TRUE)) %>%
  ggplot(aes(month)) +
  geom_bar(fill = "blue", alpha = 0.5) +
  coord_flip() +
  ggtitle("bargraph by month") +
  theme_minimal()
month_plot

# Keep only the identifying columns plus the hourly timestamp.
data <- flights %>%
  select(flight, carrier, time_hour)

# Line chart: number of flights per hour of the day, with a tick every 4 hours.
data %>%
  mutate(hour = hour(time_hour)) %>%
  group_by(hour) %>%
  tally() %>%
  ggplot(aes(x = hour, y = n)) +
  geom_line(color = "blue") +
  scale_x_continuous(breaks = seq(0, 24, 4))
data
## # A tibble: 336,776 × 3
## flight carrier time_hour
## <int> <chr> <dttm>
## 1 1545 UA 2013-01-01 05:00:00
## 2 1714 UA 2013-01-01 05:00:00
## 3 1141 AA 2013-01-01 05:00:00
## 4 725 B6 2013-01-01 05:00:00
## 5 461 DL 2013-01-01 06:00:00
## 6 1696 UA 2013-01-01 05:00:00
## 7 507 B6 2013-01-01 06:00:00
## 8 5708 EV 2013-01-01 06:00:00
## 9 79 B6 2013-01-01 06:00:00
## 10 301 AA 2013-01-01 06:00:00
## # ℹ 336,766 more rows
# Parse the same ISO date string with lubridate, then with base R.
string1 <- as_date("2020-09-22")
string1
## [1] "2020-09-22"
string1 <- as.Date("2020-09-22")
class(string1)
## [1] "Date"
# Promote the Date to a date-time using base R.
string2 <- as.POSIXct(string1)
class(string2)
## [1] "POSIXct" "POSIXt"
string2
## [1] "2020-09-22 UTC"
# Same promotion using lubridate.
datetime <- as_datetime(string1)
class(datetime)
## [1] "POSIXct" "POSIXt"
# Sample date strings in several layouts.
dateformat1 <- "20200922" # year-month-day, no separators
dateformat2 <- "09-22-2020" # month-day-year
dateformat3 <- "22/09/2020" # day-month-year
dateformat4 <- "09-22-2020 17:00:00" # month-day-year plus time
dateformat5 <- "20200922 17 00 00" # year-month-day plus time

# The lubridate parser name must match the field order of the input string;
# a mismatched parser (e.g. dmy() on a month-day-year string) returns NA.
class(datetime)
## [1] "POSIXct" "POSIXt"
class(ymd(dateformat1))
## [1] "Date"
class(mdy(dateformat2))
## [1] "Date"
class(dmy(dateformat3))
## [1] "Date"
class(mdy_hms(dateformat4))
## [1] "POSIXct" "POSIXt"
class(ymd_hms(dateformat5))
## [1] "POSIXct" "POSIXt"

# Extract the month as a labelled, ordered factor
month(ymd(dateformat1), label = TRUE)
## [1] Sep
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
month(mdy(dateformat2), label = TRUE)
## [1] Sep
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
month(mdy_hms(dateformat4), label = TRUE)
## [1] Sep
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
month(ymd_hms(dateformat5), label = TRUE)
## [1] Sep
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec

# Extract the day of the week
wday(ymd(dateformat1), label = TRUE)
## [1] Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
wday(mdy(dateformat2), label = TRUE)
## [1] Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
wday(mdy_hms(dateformat4), label = TRUE)
## [1] Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# Durations, periods and intervals
startdate <- as_datetime("2020-03-01 00:00:00")
enddate <- as_datetime("2020-03-31 23:59:59")
# Named `time_diff` so the variable does not reuse the name of base difftime().
time_diff <- enddate - startdate
as.duration(time_diff) # exact elapsed seconds
## [1] "2678399s (~4.43 weeks)"
as.period(time_diff) # the same span in days/hours/minutes/seconds
## [1] "30d 23H 59M 59S"
# interval() is the constructor for a span between two instants.
interval(startdate, enddate)
## [1] 2020-03-01 UTC--2020-03-31 23:59:59 UTC
# Operator form of the same interval: startdate %--% enddate

# Add minutes, hours and seconds to a date-time
startdate + minutes(23)
## [1] "2020-03-01 00:23:00 UTC"
startdate + hours(3) + minutes(10) + seconds(10)
## [1] "2020-03-01 03:10:10 UTC"
# Set a date-time in a specific time zone.
# The date and time must be one quoted string; an unquoted 12:00:00 is parsed
# by R as a numeric sequence. The zone name is case-sensitive.
stTime <- ymd_hms("2020-03-01 12:00:00", tz = "America/New_York")
stTime + ddays(1) # duration: exactly 86400 seconds later
## [1] "2020-03-02 12:00:00 EST"
stTime + days(1) # period: same clock time on the next calendar day
## [1] "2020-03-02 12:00:00 EST"
# Combine the date part of `startdate` with a time-of-day string.
my_time <- "10:40:25"
# %m = month, %d = day, %M = minute. The format and time zone are given
# explicitly so the result does not depend on the session's local zone.
my_date_time_2 <- as.POSIXct(
  paste(format(startdate, "%Y-%m-%d"), my_time),
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
my_date_time_2
## [1] "2020-03-01 10:40:25 UTC"
# Month arithmetic on a Date
my_date <- as_date("2022-10-01")
class(my_date)
## [1] "Date"
my_date %m+% months(10) # 10 months later
## [1] "2023-08-01"
my_date %m-% months(30) # 30 months earlier
## [1] "2020-04-01"
# my_date %m+% years(5) # 5 years later

# Week number: base strftime() with %V and lubridate's week() disagree here.
strftime(my_date, format = "%V")
## [1] "39"
lubridate::week(ymd(my_date))
## [1] 40
#extract month from yearmon
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# The format is a separate argument to as.yearmon(), not part of the data string.
my_date <- as.yearmon("Apr 2022", "%b %Y")
class(my_date)
## [1] "yearmon"
# format() always returns character, whatever component is requested.
class(format(my_date, "%y"))
## [1] "character"
format(my_date, "%b") # abbreviated month name
## [1] "Apr"
format(my_date, "%m") # zero-padded month number, still character
## [1] "04"
class(as.numeric(format(my_date, "%m")))
## [1] "numeric"
# The lubridate accessors return numbers directly.
year(my_date)
## [1] 2022
month(my_date)
## [1] 4
# Factor to Date
# The first element originally had month "0", which is not a valid month.
my_fac <- factor(c("2020-01-05", "2022-01-07", "2020-10-05"))
class(my_fac)
## [1] "factor"
# %Y is the four-digit year; %y would read only two digits.
my_fac_dates <- as.Date(my_fac, format = "%Y-%m-%d") # or ymd(as.character(my_fac))
class(my_fac_dates)
## [1] "Date"
# Number of whole months between two dates
date_1 <- as.Date("2020-08-10")
date_2 <- as.Date("2025-01-01")
# Divide by a one-month period; months(2) would count two-month blocks instead.
interval(date_1, date_2) %/% months(1)
## [1] 52
# Aggregate: build a reproducible sample of 100 dated values.
set.seed(123345)
# 1000 consecutive days starting 2020-01-01 to draw the dates from.
date_pool <- seq(as.Date("2020/01/01"), by = "day", length.out = 1000)
data <- data.frame(
  date = sample(date_pool, 100, replace = TRUE),
  value = round(rnorm(100, 5, 2), 2)
)
data
## date value
## 1 2020-03-17 6.00
## 2 2020-12-24 5.08
## 3 2022-03-06 3.67
## 4 2020-08-15 2.73
## 5 2021-11-04 3.49
## 6 2020-05-30 7.21
## 7 2021-02-05 5.20
## 8 2021-06-23 7.62
## 9 2020-02-29 3.59
## 10 2022-08-02 10.22
## 11 2020-02-14 6.13
## 12 2021-01-24 5.65
## 13 2022-05-17 8.14
## 14 2020-04-06 7.30
## 15 2022-09-24 5.97
## 16 2020-09-19 2.24
## 17 2020-10-02 3.23
## 18 2020-04-11 3.58
## 19 2021-07-28 3.74
## 20 2020-04-25 6.18
## 21 2022-03-31 6.80
## 22 2021-06-06 2.58
## 23 2021-04-17 4.79
## 24 2021-08-05 4.43
## 25 2021-07-06 5.21
## 26 2022-02-07 7.35
## 27 2021-01-21 2.31
## 28 2022-08-16 4.57
## 29 2021-07-04 6.01
## 30 2021-07-14 7.19
## 31 2020-06-11 7.44
## 32 2021-04-22 3.28
## 33 2022-01-05 5.82
## 34 2020-11-18 6.28
## 35 2021-12-06 0.34
## 36 2020-03-09 5.14
## 37 2021-11-13 6.44
## 38 2020-06-17 4.21
## 39 2022-06-25 7.53
## 40 2021-01-16 2.41
## 41 2021-12-02 4.61
## 42 2021-07-30 1.86
## 43 2022-01-28 4.74
## 44 2021-05-28 5.85
## 45 2020-09-27 1.40
## 46 2022-08-08 4.35
## 47 2020-05-25 1.49
## 48 2021-12-31 2.37
## 49 2020-05-21 7.97
## 50 2022-08-09 7.04
## 51 2022-06-15 7.66
## 52 2021-06-22 4.28
## 53 2021-04-07 4.35
## 54 2020-10-26 4.67
## 55 2022-05-30 5.99
## 56 2020-03-13 4.16
## 57 2021-07-13 5.15
## 58 2020-11-17 2.99
## 59 2021-01-22 4.93
## 60 2021-12-31 4.52
## 61 2021-09-24 4.42
## 62 2020-12-23 3.20
## 63 2022-08-04 4.78
## 64 2021-08-08 8.84
## 65 2022-01-07 3.26
## 66 2022-09-10 5.38
## 67 2021-11-29 5.66
## 68 2020-12-22 4.29
## 69 2022-04-14 4.57
## 70 2021-11-03 1.89
## 71 2020-05-14 10.57
## 72 2021-08-30 5.72
## 73 2022-08-29 6.15
## 74 2021-07-04 7.65
## 75 2021-08-20 5.80
## 76 2020-02-21 5.83
## 77 2022-06-12 8.48
## 78 2022-02-01 6.02
## 79 2021-03-26 6.16
## 80 2021-02-23 5.96
## 81 2022-09-11 3.47
## 82 2020-10-05 5.20
## 83 2020-09-07 5.18
## 84 2021-09-20 6.42
## 85 2020-01-26 6.29
## 86 2020-11-23 8.00
## 87 2021-12-29 1.62
## 88 2020-02-03 5.45
## 89 2021-07-23 3.45
## 90 2021-12-06 6.72
## 91 2021-11-25 6.03
## 92 2021-03-12 6.37
## 93 2021-10-20 9.88
## 94 2022-05-22 3.75
## 95 2020-03-18 4.72
## 96 2021-10-03 3.42
## 97 2022-02-07 0.53
## 98 2021-06-05 3.16
## 99 2020-12-17 8.77
## 100 2020-06-14 6.66
# Shapiro-Wilk normality test on the simulated values.
shapiro.test(data$value)#a large p-value means normality is not rejected
##
## Shapiro-Wilk normality test
##
## data: data$value
## W = 0.99337, p-value = 0.9096
# Work on a copy so `data` stays untouched.
data_new1 <- data
data_new1$year <- strftime(data_new1$date, "%Y") # extract year
head(data_new1$year)
## [1] "2020" "2020" "2022" "2020" "2021" "2020"
data_new1$month <- strftime(data_new1$date, "%m") # extract month
head(data_new1$month)
## [1] "03" "12" "03" "08" "11" "05"
# Base graphics takes `col`, not `color`. The year is converted explicitly
# because strftime() returns character.
plot(as.integer(data_new1$year), data_new1$value,
  type = "l", col = "blue",
  main = "value vs year", xlab = "year", ylab = "value"
)
# %A is the full weekday name (output shown for an English locale);
# "%wday" is read as %w followed by the literal text "day".
data_new1$wday <- strftime(data_new1$date, "%A") # extract weekday
head(data_new1$wday)
## [1] "Tuesday" "Thursday" "Sunday" "Saturday" "Thursday" "Saturday"
# Sum of `value` for every month within every year.
data_agg1 <- aggregate(value ~ month + year, data = data_new1, FUN = sum)
data_agg1
## month year value
## 1 01 2020 6.29
## 2 02 2020 21.00
## 3 03 2020 20.02
## 4 04 2020 17.06
## 5 05 2020 27.24
## 6 06 2020 18.31
## 7 08 2020 2.73
## 8 09 2020 8.82
## 9 10 2020 13.10
## 10 11 2020 17.27
## 11 12 2020 21.34
## 12 01 2021 15.30
## 13 02 2021 11.16
## 14 03 2021 12.53
## 15 04 2021 12.42
## 16 05 2021 5.85
## 17 06 2021 17.64
## 18 07 2021 40.26
## 19 08 2021 24.79
## 20 09 2021 10.84
## 21 10 2021 13.30
## 22 11 2021 23.51
## 23 12 2021 20.18
## 24 01 2022 13.82
## 25 02 2022 13.90
## 26 03 2022 10.47
## 27 04 2022 4.57
## 28 05 2022 17.88
## 29 06 2022 23.67
## 30 08 2022 37.11
## 31 09 2022 14.82
# Tag every observation with the first day of its month.
data_new2 <- data
data_new2[["year_month"]] <- floor_date(data_new2[["date"]], unit = "month")
data_new2
## date value year_month
## 1 2020-03-17 6.00 2020-03-01
## 2 2020-12-24 5.08 2020-12-01
## 3 2022-03-06 3.67 2022-03-01
## 4 2020-08-15 2.73 2020-08-01
## 5 2021-11-04 3.49 2021-11-01
## 6 2020-05-30 7.21 2020-05-01
## 7 2021-02-05 5.20 2021-02-01
## 8 2021-06-23 7.62 2021-06-01
## 9 2020-02-29 3.59 2020-02-01
## 10 2022-08-02 10.22 2022-08-01
## 11 2020-02-14 6.13 2020-02-01
## 12 2021-01-24 5.65 2021-01-01
## 13 2022-05-17 8.14 2022-05-01
## 14 2020-04-06 7.30 2020-04-01
## 15 2022-09-24 5.97 2022-09-01
## 16 2020-09-19 2.24 2020-09-01
## 17 2020-10-02 3.23 2020-10-01
## 18 2020-04-11 3.58 2020-04-01
## 19 2021-07-28 3.74 2021-07-01
## 20 2020-04-25 6.18 2020-04-01
## 21 2022-03-31 6.80 2022-03-01
## 22 2021-06-06 2.58 2021-06-01
## 23 2021-04-17 4.79 2021-04-01
## 24 2021-08-05 4.43 2021-08-01
## 25 2021-07-06 5.21 2021-07-01
## 26 2022-02-07 7.35 2022-02-01
## 27 2021-01-21 2.31 2021-01-01
## 28 2022-08-16 4.57 2022-08-01
## 29 2021-07-04 6.01 2021-07-01
## 30 2021-07-14 7.19 2021-07-01
## 31 2020-06-11 7.44 2020-06-01
## 32 2021-04-22 3.28 2021-04-01
## 33 2022-01-05 5.82 2022-01-01
## 34 2020-11-18 6.28 2020-11-01
## 35 2021-12-06 0.34 2021-12-01
## 36 2020-03-09 5.14 2020-03-01
## 37 2021-11-13 6.44 2021-11-01
## 38 2020-06-17 4.21 2020-06-01
## 39 2022-06-25 7.53 2022-06-01
## 40 2021-01-16 2.41 2021-01-01
## 41 2021-12-02 4.61 2021-12-01
## 42 2021-07-30 1.86 2021-07-01
## 43 2022-01-28 4.74 2022-01-01
## 44 2021-05-28 5.85 2021-05-01
## 45 2020-09-27 1.40 2020-09-01
## 46 2022-08-08 4.35 2022-08-01
## 47 2020-05-25 1.49 2020-05-01
## 48 2021-12-31 2.37 2021-12-01
## 49 2020-05-21 7.97 2020-05-01
## 50 2022-08-09 7.04 2022-08-01
## 51 2022-06-15 7.66 2022-06-01
## 52 2021-06-22 4.28 2021-06-01
## 53 2021-04-07 4.35 2021-04-01
## 54 2020-10-26 4.67 2020-10-01
## 55 2022-05-30 5.99 2022-05-01
## 56 2020-03-13 4.16 2020-03-01
## 57 2021-07-13 5.15 2021-07-01
## 58 2020-11-17 2.99 2020-11-01
## 59 2021-01-22 4.93 2021-01-01
## 60 2021-12-31 4.52 2021-12-01
## 61 2021-09-24 4.42 2021-09-01
## 62 2020-12-23 3.20 2020-12-01
## 63 2022-08-04 4.78 2022-08-01
## 64 2021-08-08 8.84 2021-08-01
## 65 2022-01-07 3.26 2022-01-01
## 66 2022-09-10 5.38 2022-09-01
## 67 2021-11-29 5.66 2021-11-01
## 68 2020-12-22 4.29 2020-12-01
## 69 2022-04-14 4.57 2022-04-01
## 70 2021-11-03 1.89 2021-11-01
## 71 2020-05-14 10.57 2020-05-01
## 72 2021-08-30 5.72 2021-08-01
## 73 2022-08-29 6.15 2022-08-01
## 74 2021-07-04 7.65 2021-07-01
## 75 2021-08-20 5.80 2021-08-01
## 76 2020-02-21 5.83 2020-02-01
## 77 2022-06-12 8.48 2022-06-01
## 78 2022-02-01 6.02 2022-02-01
## 79 2021-03-26 6.16 2021-03-01
## 80 2021-02-23 5.96 2021-02-01
## 81 2022-09-11 3.47 2022-09-01
## 82 2020-10-05 5.20 2020-10-01
## 83 2020-09-07 5.18 2020-09-01
## 84 2021-09-20 6.42 2021-09-01
## 85 2020-01-26 6.29 2020-01-01
## 86 2020-11-23 8.00 2020-11-01
## 87 2021-12-29 1.62 2021-12-01
## 88 2020-02-03 5.45 2020-02-01
## 89 2021-07-23 3.45 2021-07-01
## 90 2021-12-06 6.72 2021-12-01
## 91 2021-11-25 6.03 2021-11-01
## 92 2021-03-12 6.37 2021-03-01
## 93 2021-10-20 9.88 2021-10-01
## 94 2022-05-22 3.75 2022-05-01
## 95 2020-03-18 4.72 2020-03-01
## 96 2021-10-03 3.42 2021-10-01
## 97 2022-02-07 0.53 2022-02-01
## 98 2021-06-05 3.16 2021-06-01
## 99 2020-12-17 8.77 2020-12-01
## 100 2020-06-14 6.66 2020-06-01
library(dplyr)
# Monthly totals: sum `value` within each year_month bucket.
monthly_groups <- group_by(data_new2, year_month)
data_aggr2 <- as.data.frame(summarise(monthly_groups, value = sum(value)))
data_aggr2
## year_month value
## 1 2020-01-01 6.29
## 2 2020-02-01 21.00
## 3 2020-03-01 20.02
## 4 2020-04-01 17.06
## 5 2020-05-01 27.24
## 6 2020-06-01 18.31
## 7 2020-08-01 2.73
## 8 2020-09-01 8.82
## 9 2020-10-01 13.10
## 10 2020-11-01 17.27
## 11 2020-12-01 21.34
## 12 2021-01-01 15.30
## 13 2021-02-01 11.16
## 14 2021-03-01 12.53
## 15 2021-04-01 12.42
## 16 2021-05-01 5.85
## 17 2021-06-01 17.64
## 18 2021-07-01 40.26
## 19 2021-08-01 24.79
## 20 2021-09-01 10.84
## 21 2021-10-01 13.30
## 22 2021-11-01 23.51
## 23 2021-12-01 20.18
## 24 2022-01-01 13.82
## 25 2022-02-01 13.90
## 26 2022-03-01 10.47
## 27 2022-04-01 4.57
## 28 2022-05-01 17.88
## 29 2022-06-01 23.67
## 30 2022-08-01 37.11
## 31 2022-09-01 14.82
data_aggr2$year <- strftime(data_aggr2$year_month, "%Y")
# Group by the bare column name so the result column is called `year`
# rather than `data_aggr2$year`.
data_aggr2 %>%
  group_by(year) %>%
  summarise(value = sum(value))
## # A tibble: 3 × 2
## year value
## <chr> <dbl>
## 1 2020 173.
## 2 2021 208.
## 3 2022 136.
# `year` is character, so it is converted explicitly for the x axis.
# NOTE(review): this draws the monthly totals against their year, so each year
# has several points; confirm whether a plot against year_month was intended.
plot(as.integer(data_aggr2$year), data_aggr2$value,
  type = "l", xlab = "year", ylab = "value"
)