library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
data(flights)
head(flights)
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
creat new column in a count or group_by
attach(flights)
## The following object is masked from package:lubridate:
##
## origin
df<-flights%>%mutate(long_flights=(air_time>=6*60))
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
names(df)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour" "long_flights"
library(readxl)
m <- read_excel("C:/Users/USER/Desktop/m.xlsx")
summary(m)
## customer_id name Product_id Order_id
## Min. :1.00 Length:24 Length:24 Length:24
## 1st Qu.:1.75 Class :character Class :character Class :character
## Median :2.50 Mode :character Mode :character Mode :character
## Mean :2.50
## 3rd Qu.:3.25
## Max. :4.00
## Unit Price Total DATE
## Min. :2.00 Min. :123.0 Min. : 400.0 Min. :2020-01-03 00:00:00
## 1st Qu.:2.75 1st Qu.:180.8 1st Qu.: 703.5 1st Qu.:2020-01-23 00:00:00
## Median :4.50 Median :329.5 Median : 907.5 Median :2020-05-18 00:00:00
## Mean :5.00 Mean :284.7 Mean :1543.3 Mean :2020-04-27 07:00:00
## 3rd Qu.:6.75 3rd Qu.:377.2 3rd Qu.:1779.8 3rd Qu.:2020-07-12 00:00:00
## Max. :9.00 Max. :450.0 Max. :4050.0 Max. :2020-11-23 00:00:00
str(m)
## tibble [24 × 8] (S3: tbl_df/tbl/data.frame)
## $ customer_id: num [1:24] 1 2 3 4 1 2 3 4 1 2 ...
## $ name : chr [1:24] "john" "peter" "james" "james" ...
## $ Product_id : chr [1:24] "p1" "p2" "p3" "p1" ...
## $ Order_id : chr [1:24] "o1" "o2" "o3" "o4" ...
## $ Unit : num [1:24] 2 3 6 9 2 3 6 9 2 3 ...
## $ Price : num [1:24] 200 359 123 432 200 359 123 432 200 359 ...
## $ Total : num [1:24] 400 1077 738 3888 400 ...
## $ DATE : POSIXct[1:24], format: "2020-01-23" "2020-07-12" ...
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
c<-sqldf("select * from m")
c
library(lubridate)
f<-sqldf("select DATE,customer_id,Product_id,Order_id,Total,AVG(Total) over() as avg_total,AVG(Total) over(partition by DATE) as ave_by_day,AVG(Total) over() as avg_by_month from m")
f
d<-month(m$DATE,label = T)
d
## [1] Jan Jul Jan Jul Jan Jul Nov Jul Jan Jul Mar Jul Jan Jul Jan Jul Jan Apr Jan
## [20] Jul Jan Jul Jan Jul
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec