library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(readr)
library(lubridate)
trains <- read_csv("C:/Users/USER/Desktop/trains.csv")
## Rows: 9800 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): Order_id, Order_date, Ship_date, Ship_code, Customer_id, First_nam...
## dbl  (3): Row_id, Postal_code, Sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(trains)
##  [1] "Row_id"        "Order_id"      "Order_date"    "Ship_date"    
##  [5] "Ship_code"     "Customer_id"   "First_name"    "Last_name"    
##  [9] "City"          "State"         "Postal_code"   "Region"       
## [13] "Product_id"    "Category"      "Sub_category"  "Product _name"
## [17] "Sales"
trains$Ship_date<-dmy(trains$Ship_date)
trains$Order_date<-dmy(trains$Order_date)
str(trains)
## spc_tbl_ [9,800 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Row_id       : num [1:9800] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Order_id     : chr [1:9800] "CA-2017-152156" "CA-2017-152156" "CA-2017-138688" "US-2016-108966" ...
##  $ Order_date   : Date[1:9800], format: "2017-11-08" "2017-11-08" ...
##  $ Ship_date    : Date[1:9800], format: "2017-11-11" "2017-11-11" ...
##  $ Ship_code    : chr [1:9800] "Second Class" "Second Class" "Second Class" "Standard Class" ...
##  $ Customer_id  : chr [1:9800] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
##  $ First_name   : chr [1:9800] "Claire" "Claire" "Darrin" "Sean" ...
##  $ Last_name    : chr [1:9800] "Gute" "Gute" "Van" "O'Donnell" ...
##  $ City         : chr [1:9800] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
##  $ State        : chr [1:9800] "Kentucky" "Kentucky" "California" "Florida" ...
##  $ Postal_code  : num [1:9800] 42420 42420 90036 33311 33311 ...
##  $ Region       : chr [1:9800] "South" "South" "West" "South" ...
##  $ Product_id   : chr [1:9800] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
##  $ Category     : chr [1:9800] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
##  $ Sub_category : chr [1:9800] "Bookcases" "Chairs" "Labels" "Tables" ...
##  $ Product _name: chr [1:9800] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
##  $ Sales        : num [1:9800] 262 731.9 14.6 957.6 22.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Row_id = col_double(),
##   ..   Order_id = col_character(),
##   ..   Order_date = col_character(),
##   ..   Ship_date = col_character(),
##   ..   Ship_code = col_character(),
##   ..   Customer_id = col_character(),
##   ..   First_name = col_character(),
##   ..   Last_name = col_character(),
##   ..   City = col_character(),
##   ..   State = col_character(),
##   ..   Postal_code = col_double(),
##   ..   Region = col_character(),
##   ..   Product_id = col_character(),
##   ..   Category = col_character(),
##   ..   Sub_category = col_character(),
##   ..   `Product _name` = col_character(),
##   ..   Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

TOTAL SALE OF THE COMPANY

summary(trains$Sales)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.444    17.248    54.490   230.769   210.605 22638.480
hist<-trains%>% ggplot(aes(trains$Sales))+geom_histogram(fill="purple",alpha=0.5)+geom_density(colour="green")
hist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

TOTAL_SALE<-sqldf("select sum(Sales) from trains")
TOTAL_SALE
##   sum(Sales)
## 1    2261537

SALES BY MONTH

sdd<-sqldf("select strftime('%m',Order_date)as month ,sum(Sales)as Total_sales from trains group by month order by Total_sales ")
sdd
##    month Total_sales
## 1     01    78308.86
## 2     12    98747.27
## 3     06   135693.49
## 4     03   139387.59
## 5     04   144303.69
## 6     05   151239.20
## 7     02   187473.33
## 8     07   195590.76
## 9     09   219579.31
## 10    11   245273.66
## 11    08   299153.39
## 12    10   366786.22
dg<-sqldf("select Order_date,sum(Sales) as Total_sales from trains
          group by Order_date limit 10")
dg
##    Order_date Total_sales
## 1  2015-01-03      16.448
## 2  2015-01-04     288.060
## 3  2015-01-05      19.536
## 4  2015-01-06    4407.100
## 5  2015-01-07      87.158
## 6  2015-01-09      40.544
## 7  2015-01-10      54.830
## 8  2015-01-11       9.940
## 9  2015-01-13    3553.795
## 10 2015-01-14      61.960
dd<-sqldf("select Total_sales,case
          when month='01'then 'Jan' 
          when month='02'then 'Feb'
          when month='03'then 'Marc'
          when month='04'then 'April'
          when month='05'then 'May'
          when month='06'then 'Jun'
          when month='07'then 'July'
          when month='08'then 'Aug'
          when month='09'then 'Sept'
          when month='10'then 'Oct'
          when month='11' then 'Nov'
          when month='12'then 'Dec'
          else 0 end as month from sdd")
dd
##    Total_sales month
## 1     78308.86   Jan
## 2     98747.27   Dec
## 3    135693.49   Jun
## 4    139387.59  Marc
## 5    144303.69 April
## 6    151239.20   May
## 7    187473.33   Feb
## 8    195590.76  July
## 9    219579.31  Sept
## 10   245273.66   Nov
## 11   299153.39   Aug
## 12   366786.22   Oct
dd$month<-fct_reorder(dd$month,dd$Total_sales)
d<-dd%>%ggplot(aes(month,Total_sales))+geom_col(fill="purple")+coord_flip()+theme_minimal()+labs(title="Total sales by Month")
d

SALES BY DAY

day<-sqldf("select sum(sales) as Daily_sales,strftime('%d',Order_date) as Day from trains group by day order by Daily_sales desc")
day
##    Daily_sales Day
## 1    125501.13  10
## 2     95078.73  26
## 3     94871.15  14
## 4     92048.42  18
## 5     91313.91  08
## 6     90849.13  01
## 7     86678.00  12
## 8     80306.44  24
## 9     80251.10  04
## 10    79652.02  27
## 11    78756.88  20
## 12    76525.35  11
## 13    75434.14  16
## 14    75159.50  25
## 15    74744.36  02
## 16    73151.13  23
## 17    72168.77  13
## 18    70929.30  06
## 19    68883.43  05
## 20    66930.59  17
## 21    65651.99  28
## 22    65413.36  07
## 23    63831.69  22
## 24    62539.78  15
## 25    57876.05  30
## 26    57009.75  21
## 27    55948.83  03
## 28    54166.93  19
## 29    44271.69  29
## 30    42848.61  09
## 31    42744.64  31

VISUALIZATION OF DAILY SALES

day$Day<-fct_reorder(day$Day,day$Daily_sales)
daily_sales<-day%>%ggplot(aes(Day,Daily_sales))+geom_col(fill="orange")+coord_flip()
daily_sales

SALE BY WEEK DAYS

sales_by_state<-sqldf("select State,avg(Sales)  from trains where Region='West' ")
sales_by_state
##        State avg(Sales)
## 1 California   226.1846

TOTAL SALES BY REGION

df<-sqldf("select Region ,sum(Sales) as Total_sales from trains group by Region order by Total_sales desc")
df
##    Region Total_sales
## 1    West    710219.7
## 2    East    669518.7
## 3 Central    492646.9
## 4   South    389151.5
df$Region<-fct_reorder(df$Region,df$Total_sales)
f<-df%>% ggplot(aes(Region,Total_sales))+geom_col(fill="blue")+coord_flip()+theme_minimal()+labs(title="Total sales by Region")
f

TOTAL SALES BY CATEGORY

s<-sqldf("select Category,sum(Sales) as Total_sales from trains
         group by Category order by Total_sales desc")
s
##          Category Total_sales
## 1      Technology    827455.9
## 2       Furniture    728658.6
## 3 Office Supplies    705422.3

VISUALIZATION OF TOTAL SALES BY CATEGORY

s$Category<-fct_reorder(s$Category,s$Total_sales)
dd<-s%>%ggplot(aes(Category,Total_sales))+geom_col(fill="purple")+coord_flip()+theme_minimal()+labs(title="Total sale by Category")
dd

TEN PRODUCTS WITH HIGHEST SALES

gg<-sqldf("select Postal_code,Category,Product_id,sum(Sales) as Total_sales from trains
          group by Postal_code,Category,Product_id order by Total_sales desc limit 10 ")
gg
##    Postal_code        Category      Product_id Total_sales
## 1        32216      Technology TEC-MA-10002412   22638.480
## 2        47905      Technology TEC-CO-10004722   17499.950
## 3        98115      Technology TEC-CO-10004722   13999.960
## 4        10024      Technology TEC-CO-10004722   11199.968
## 5        19711      Technology TEC-CO-10004722   10499.970
## 6        48205 Office Supplies OFF-BI-10000545    9892.740
## 7        55407 Office Supplies OFF-BI-10001120    9449.950
## 8         8701      Technology TEC-MA-10001047    9099.930
## 9        22204      Technology TEC-MA-10001127    8749.950
## 10       19120      Technology TEC-CO-10004722    8399.976

TEN PRODUCTS WITH LEAST SALES

g<-sqldf("select Postal_code,Category,Product_id,sum(Sales) as Total_sales from trains
          group by Postal_code,Category,Product_id order by Total_sales asc limit 10")
g
##    Postal_code        Category      Product_id Total_sales
## 1        77095 Office Supplies OFF-AP-10002906       0.444
## 2        76706 Office Supplies OFF-BI-10004022       0.556
## 3        60623 Office Supplies OFF-BI-10003727       0.836
## 4        75150 Office Supplies OFF-BI-10003460       0.876
## 5        77340 Office Supplies OFF-BI-10004140       0.898
## 6        77041 Office Supplies OFF-BI-10000201       0.984
## 7        94110      Technology TEC-AC-10003709       0.990
## 8        75217 Office Supplies OFF-BI-10000494       1.044
## 9        75220 Office Supplies OFF-BI-10002012       1.080
## 10       80013 Office Supplies OFF-BI-10002012       1.080

sales by order date

d<-s%>%ggplot(aes(Category,Total_sales))+geom_col(fill="purple")+coord_flip()+theme_minimal()+labs(title="Top 10 from bottom Total sale by Category")
d

percentage of sale by category

head(trains<-trains%>% mutate(per=Sales/sum(Sales)))
## # A tibble: 6 × 18
##   Row_id Order_id       Order_date Ship_date  Ship_code   Customer_id First_name
##    <dbl> <chr>          <date>     <date>     <chr>       <chr>       <chr>     
## 1      1 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520    Claire    
## 2      2 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520    Claire    
## 3      3 CA-2017-138688 2017-06-12 2017-06-16 Second Cla… DV-13045    Darrin    
## 4      4 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335    Sean      
## 5      5 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335    Sean      
## 6      6 CA-2015-115812 2015-06-09 2015-06-14 Standard C… BH-11710    Brosina   
## # ℹ 11 more variables: Last_name <chr>, City <chr>, State <chr>,
## #   Postal_code <dbl>, Region <chr>, Product_id <chr>, Category <chr>,
## #   Sub_category <chr>, `Product _name` <chr>, Sales <dbl>, per <dbl>
head(trains<-trains%>% mutate(per=per*100))
## # A tibble: 6 × 18
##   Row_id Order_id       Order_date Ship_date  Ship_code   Customer_id First_name
##    <dbl> <chr>          <date>     <date>     <chr>       <chr>       <chr>     
## 1      1 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520    Claire    
## 2      2 CA-2017-152156 2017-11-08 2017-11-11 Second Cla… CG-12520    Claire    
## 3      3 CA-2017-138688 2017-06-12 2017-06-16 Second Cla… DV-13045    Darrin    
## 4      4 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335    Sean      
## 5      5 US-2016-108966 2016-10-11 2016-10-18 Standard C… SO-20335    Sean      
## 6      6 CA-2015-115812 2015-06-09 2015-06-14 Standard C… BH-11710    Brosina   
## # ℹ 11 more variables: Last_name <chr>, City <chr>, State <chr>,
## #   Postal_code <dbl>, Region <chr>, Product_id <chr>, Category <chr>,
## #   Sub_category <chr>, `Product _name` <chr>, Sales <dbl>, per <dbl>
ssss<-sqldf("select per*100 from trains limit 4")
ssss
##      per*100
## 1 1.15832739
## 2 3.23647179
## 3 0.06464631
## 4 4.23418937
sss<-sqldf("select Category,per from trains
           group by Category order by per desc")
sss
##          Category          per
## 1      Technology 0.0401121930
## 2       Furniture 0.0115832739
## 3 Office Supplies 0.0006464631