library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(readr)
library(lubridate)
trains <- read_csv("C:/Users/USER/Desktop/trains.csv")
## Rows: 9800 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): Order_id, Order_date, Ship_date, Ship_code, Customer_id, First_nam...
## dbl (3): Row_id, Postal_code, Sales
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(trains)
## [1] "Row_id" "Order_id" "Order_date" "Ship_date"
## [5] "Ship_code" "Customer_id" "First_name" "Last_name"
## [9] "City" "State" "Postal_code" "Region"
## [13] "Product_id" "Category" "Sub_category" "Product _name"
## [17] "Sales"
trains$Ship_date<-dmy(trains$Ship_date)
trains$Order_date<-dmy(trains$Order_date)
str(trains)
## spc_tbl_ [9,800 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Row_id : num [1:9800] 1 2 3 4 5 6 7 8 9 10 ...
## $ Order_id : chr [1:9800] "CA-2017-152156" "CA-2017-152156" "CA-2017-138688" "US-2016-108966" ...
## $ Order_date : Date[1:9800], format: "2017-11-08" "2017-11-08" ...
## $ Ship_date : Date[1:9800], format: "2017-11-11" "2017-11-11" ...
## $ Ship_code : chr [1:9800] "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer_id : chr [1:9800] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ First_name : chr [1:9800] "Claire" "Claire" "Darrin" "Sean" ...
## $ Last_name : chr [1:9800] "Gute" "Gute" "Van" "O'Donnell" ...
## $ City : chr [1:9800] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
## $ State : chr [1:9800] "Kentucky" "Kentucky" "California" "Florida" ...
## $ Postal_code : num [1:9800] 42420 42420 90036 33311 33311 ...
## $ Region : chr [1:9800] "South" "South" "West" "South" ...
## $ Product_id : chr [1:9800] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr [1:9800] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub_category : chr [1:9800] "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product _name: chr [1:9800] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num [1:9800] 262 731.9 14.6 957.6 22.4 ...
## - attr(*, "spec")=
## .. cols(
## .. Row_id = col_double(),
## .. Order_id = col_character(),
## .. Order_date = col_character(),
## .. Ship_date = col_character(),
## .. Ship_code = col_character(),
## .. Customer_id = col_character(),
## .. First_name = col_character(),
## .. Last_name = col_character(),
## .. City = col_character(),
## .. State = col_character(),
## .. Postal_code = col_double(),
## .. Region = col_character(),
## .. Product_id = col_character(),
## .. Category = col_character(),
## .. Sub_category = col_character(),
## .. `Product _name` = col_character(),
## .. Sales = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
TOTAL SALE OF THE COMPANY
summary(trains$Sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.444 17.248 54.490 230.769 210.605 22638.480
hist<-trains%>% ggplot(aes(trains$Sales))+geom_histogram(fill="purple",alpha=0.5)+geom_density(colour="green")
hist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
TOTAL_SALE<-sqldf("select sum(Sales) from trains")
TOTAL_SALE
## sum(Sales)
## 1 2261537
SALES BY MONTH
sdd<-sqldf("select strftime('%m',Order_date)as month ,sum(Sales)as Total_sales from trains group by month order by Total_sales ")
sdd
## month Total_sales
## 1 01 78308.86
## 2 12 98747.27
## 3 06 135693.49
## 4 03 139387.59
## 5 04 144303.69
## 6 05 151239.20
## 7 02 187473.33
## 8 07 195590.76
## 9 09 219579.31
## 10 11 245273.66
## 11 08 299153.39
## 12 10 366786.22
dg<-sqldf("select Order_date,sum(Sales) as Total_sales from trains
group by Order_date limit 10")
dg
## Order_date Total_sales
## 1 2015-01-03 16.448
## 2 2015-01-04 288.060
## 3 2015-01-05 19.536
## 4 2015-01-06 4407.100
## 5 2015-01-07 87.158
## 6 2015-01-09 40.544
## 7 2015-01-10 54.830
## 8 2015-01-11 9.940
## 9 2015-01-13 3553.795
## 10 2015-01-14 61.960
dd<-sqldf("select Total_sales,case
when month='01'then 'Jan'
when month='02'then 'Feb'
when month='03'then 'Marc'
when month='04'then 'April'
when month='05'then 'May'
when month='06'then 'Jun'
when month='07'then 'July'
when month='08'then 'Aug'
when month='09'then 'Sept'
when month='10'then 'Oct'
when month='11' then 'Nov'
when month='12'then 'Dec'
else 0 end as month from sdd")
dd
## Total_sales month
## 1 78308.86 Jan
## 2 98747.27 Dec
## 3 135693.49 Jun
## 4 139387.59 Marc
## 5 144303.69 April
## 6 151239.20 May
## 7 187473.33 Feb
## 8 195590.76 July
## 9 219579.31 Sept
## 10 245273.66 Nov
## 11 299153.39 Aug
## 12 366786.22 Oct
dd$month<-fct_reorder(dd$month,dd$Total_sales)
d<-dd%>%ggplot(aes(month,Total_sales))+geom_col(fill="purple")+coord_flip()+theme_minimal()+labs(title="Total sales by Month")
d
SALES BY DAY
day<-sqldf("select sum(sales) as Daily_sales,strftime('%d',Order_date) as Day from trains group by day order by Daily_sales desc")
day
## Daily_sales Day
## 1 125501.13 10
## 2 95078.73 26
## 3 94871.15 14
## 4 92048.42 18
## 5 91313.91 08
## 6 90849.13 01
## 7 86678.00 12
## 8 80306.44 24
## 9 80251.10 04
## 10 79652.02 27
## 11 78756.88 20
## 12 76525.35 11
## 13 75434.14 16
## 14 75159.50 25
## 15 74744.36 02
## 16 73151.13 23
## 17 72168.77 13
## 18 70929.30 06
## 19 68883.43 05
## 20 66930.59 17
## 21 65651.99 28
## 22 65413.36 07
## 23 63831.69 22
## 24 62539.78 15
## 25 57876.05 30
## 26 57009.75 21
## 27 55948.83 03
## 28 54166.93 19
## 29 44271.69 29
## 30 42848.61 09
## 31 42744.64 31
VISUALIZATION OF DAILY SALES
day$Day<-fct_reorder(day$Day,day$Daily_sales)
daily_sales<-day%>%ggplot(aes(Day,Daily_sales))+geom_col(fill="orange")+coord_flip()
daily_sales
SALE BY WEEK DAYS
sales_by_state<-sqldf("select State,avg(Sales) from trains where Region='West' ")
sales_by_state
## State avg(Sales)
## 1 California 226.1846
TOTAL SALES BY REGION
df<-sqldf("select Region ,sum(Sales) as Total_sales from trains group by Region order by Total_sales desc")
df
## Region Total_sales
## 1 West 710219.7
## 2 East 669518.7
## 3 Central 492646.9
## 4 South 389151.5
df$Region<-fct_reorder(df$Region,df$Total_sales)
f<-df%>% ggplot(aes(Region,Total_sales))+geom_col(fill="blue")+coord_flip()+theme_minimal()+labs(title="Total sales by Region")
f
TOTAL SALES BY CATEGORY
s<-sqldf("select Category,sum(Sales) as Total_sales from trains
group by Category order by Total_sales desc")
s
## Category Total_sales
## 1 Technology 827455.9
## 2 Furniture 728658.6
## 3 Office Supplies 705422.3
VISUALIZATION OF TOTAL SALES BY CATEGORY
s$Category<-fct_reorder(s$Category,s$Total_sales)
dd<-s%>%ggplot(aes(Category,Total_sales))+geom_col(fill="purple")+coord_flip()+theme_minimal()+labs(title="Total sale by Category")
dd
TEN PRODUCTS WITH HIGHEST SALES
gg<-sqldf("select Postal_code,Category,Product_id,sum(Sales) as Total_sales from trains
group by Postal_code,Category,Product_id order by Total_sales desc limit 10 ")
gg
## Postal_code Category Product_id Total_sales
## 1 32216 Technology TEC-MA-10002412 22638.480
## 2 47905 Technology TEC-CO-10004722 17499.950
## 3 98115 Technology TEC-CO-10004722 13999.960
## 4 10024 Technology TEC-CO-10004722 11199.968
## 5 19711 Technology TEC-CO-10004722 10499.970
## 6 48205 Office Supplies OFF-BI-10000545 9892.740
## 7 55407 Office Supplies OFF-BI-10001120 9449.950
## 8 8701 Technology TEC-MA-10001047 9099.930
## 9 22204 Technology TEC-MA-10001127 8749.950
## 10 19120 Technology TEC-CO-10004722 8399.976
TEN PRODUCTS WITH LEAST SALES
g<-sqldf("select Postal_code,Category,Product_id,sum(Sales) as Total_sales from trains
group by Postal_code,Category,Product_id order by Total_sales asc limit 10")
g
## Postal_code Category Product_id Total_sales
## 1 77095 Office Supplies OFF-AP-10002906 0.444
## 2 76706 Office Supplies OFF-BI-10004022 0.556
## 3 60623 Office Supplies OFF-BI-10003727 0.836
## 4 75150 Office Supplies OFF-BI-10003460 0.876
## 5 77340 Office Supplies OFF-BI-10004140 0.898
## 6 77041 Office Supplies OFF-BI-10000201 0.984
## 7 94110 Technology TEC-AC-10003709 0.990
## 8 75217 Office Supplies OFF-BI-10000494 1.044
## 9 75220 Office Supplies OFF-BI-10002012 1.080
## 10 80013 Office Supplies OFF-BI-10002012 1.080
sales by order date
d<-s%>%ggplot(aes(Category,Total_sales))+geom_col(fill="purple")+coord_flip()+theme_minimal()+labs(title="Top 10 from bottom Total sale by Category")
d
percentage of sale by category
head(trains<-trains%>% mutate(per=Sales/sum(Sales)))
## # A tibble: 6 × 18
## Row_id Order_id Order_date Ship_date Ship_code Customer_id First_name
## <dbl> <chr> <date> <date> <chr> <chr> <chr>
## 1 1 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520 Claire
## 2 2 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520 Claire
## 3 3 CA-2017-138688 2017-06-12 2017-06-16 Second Cla… DV-13045 Darrin
## 4 4 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335 Sean
## 5 5 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335 Sean
## 6 6 CA-2015-115812 2015-06-09 2015-06-14 Standard C… BH-11710 Brosina
## # ℹ 11 more variables: Last_name <chr>, City <chr>, State <chr>,
## # Postal_code <dbl>, Region <chr>, Product_id <chr>, Category <chr>,
## # Sub_category <chr>, `Product _name` <chr>, Sales <dbl>, per <dbl>
head(trains<-trains%>% mutate(per=per*100))
## # A tibble: 6 × 18
## Row_id Order_id Order_date Ship_date Ship_code Customer_id First_name
## <dbl> <chr> <date> <date> <chr> <chr> <chr>
## 1 1 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520 Claire
## 2 2 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520 Claire
## 3 3 CA-2017-138688 2017-06-12 2017-06-16 Second Cla… DV-13045 Darrin
## 4 4 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335 Sean
## 5 5 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335 Sean
## 6 6 CA-2015-115812 2015-06-09 2015-06-14 Standard C… BH-11710 Brosina
## # ℹ 11 more variables: Last_name <chr>, City <chr>, State <chr>,
## # Postal_code <dbl>, Region <chr>, Product_id <chr>, Category <chr>,
## # Sub_category <chr>, `Product _name` <chr>, Sales <dbl>, per <dbl>
ssss<-sqldf("select per*100 from trains limit 4")
ssss
## per*100
## 1 1.15832739
## 2 3.23647179
## 3 0.06464631
## 4 4.23418937
sss<-sqldf("select Category,per from trains
group by Category order by per desc")
sss
## Category per
## 1 Technology 0.0401121930
## 2 Furniture 0.0115832739
## 3 Office Supplies 0.0006464631