library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
data(flights)
head(flights)
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"

creat new column in a count or group_by

attach(flights)
## The following object is masked from package:lubridate:
## 
##     origin
df<-flights%>%mutate(long_flights=(air_time>=6*60))
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
names(df)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"      "long_flights"
library(readxl)
m <- read_excel("C:/Users/USER/Desktop/m.xlsx")
summary(m)
##   customer_id       name            Product_id          Order_id        
##  Min.   :1.00   Length:24          Length:24          Length:24         
##  1st Qu.:1.75   Class :character   Class :character   Class :character  
##  Median :2.50   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2.50                                                           
##  3rd Qu.:3.25                                                           
##  Max.   :4.00                                                           
##       Unit          Price           Total             DATE                    
##  Min.   :2.00   Min.   :123.0   Min.   : 400.0   Min.   :2020-01-03 00:00:00  
##  1st Qu.:2.75   1st Qu.:180.8   1st Qu.: 703.5   1st Qu.:2020-01-23 00:00:00  
##  Median :4.50   Median :329.5   Median : 907.5   Median :2020-05-18 00:00:00  
##  Mean   :5.00   Mean   :284.7   Mean   :1543.3   Mean   :2020-04-27 07:00:00  
##  3rd Qu.:6.75   3rd Qu.:377.2   3rd Qu.:1779.8   3rd Qu.:2020-07-12 00:00:00  
##  Max.   :9.00   Max.   :450.0   Max.   :4050.0   Max.   :2020-11-23 00:00:00
str(m)
## tibble [24 × 8] (S3: tbl_df/tbl/data.frame)
##  $ customer_id: num [1:24] 1 2 3 4 1 2 3 4 1 2 ...
##  $ name       : chr [1:24] "john" "peter" "james" "james" ...
##  $ Product_id : chr [1:24] "p1" "p2" "p3" "p1" ...
##  $ Order_id   : chr [1:24] "o1" "o2" "o3" "o4" ...
##  $ Unit       : num [1:24] 2 3 6 9 2 3 6 9 2 3 ...
##  $ Price      : num [1:24] 200 359 123 432 200 359 123 432 200 359 ...
##  $ Total      : num [1:24] 400 1077 738 3888 400 ...
##  $ DATE       : POSIXct[1:24], format: "2020-01-23" "2020-07-12" ...
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
c<-sqldf("select * from m")
c
library(lubridate)
f<-sqldf("select DATE,customer_id,Product_id,Order_id,Total,AVG(Total) over() as avg_total,AVG(Total) over(partition by DATE) as ave_by_day,AVG(Total) over() as avg_by_month from m")
f
d<-month(m$DATE,label = T)
d
##  [1] Jan Jul Jan Jul Jan Jul Nov Jul Jan Jul Mar Jul Jan Jul Jan Jul Jan Apr Jan
## [20] Jul Jan Jul Jan Jul
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec