Clean the data if necessary (e.g., fix incorrect or inconsistent values, or change them to NAs if there is no obvious fix)
library(readxl)
# Load the raw retail transactions from the Excel workbook.
Retail_csv <- read_excel("~/Retail.csv.xlsx")

# Free-text placeholders seen in `Description` that do not name a product.
# Matching is exact (via %in%), so every observed variant is listed verbatim.
placeholder_descriptions <- c(
  "?", "??", "???", "?sold as sets?", "?lost", "wet",
  "sold in set?", "?missing", "missing?", "???lost",
  "POSSIBLE DAMAGES OR LOST?", "? sold as sets?", "???missing",
  "wet?", "lost??", "damages?", "???damages???", "check?",
  "damages/dotcom?", "historic computer difference?.....se",
  "historic computer difference?....se", "?display",
  "?code mix up?84930", "?? missing", "????damages????",
  "????missing", "?display?", "wrong code?", "code mix up? 84930"
)

# Replace placeholder descriptions with NA; all other values are untouched.
# %in% never returns NA, so descriptions that are already NA simply stay NA.
Retail <- Retail_csv$Description
Retail[Retail %in% placeholder_descriptions] <- NA
Retail_csv$Description <- Retail

New_table <- Retail_csv

# Negative quantities and negative unit prices are treated as invalid -> NA.
New_table_Quantity <- replace(New_table$Quantity, New_table$Quantity < 0, NA)
New_table_UnitPrice <- replace(New_table$UnitPrice, New_table$UnitPrice < 0, NA)

# Assign by column name rather than by position (previously columns 4 and 6),
# so a change in the workbook's column order cannot overwrite the wrong columns.
New_table$Quantity <- New_table_Quantity
New_table$UnitPrice <- New_table_UnitPrice

summary(New_table)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 10.66
## 3rd Qu.: 10.00
## Max. :80995.00
## NA's :10624
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.00 Min. :12346
## 1st Qu.:2011-03-28 11:34:00 1st Qu.: 1.25 1st Qu.:13953
## Median :2011-07-19 17:17:00 Median : 2.08 Median :15152
## Mean :2011-07-04 13:34:57 Mean : 4.65 Mean :15288
## 3rd Qu.:2011-10-19 11:27:00 3rd Qu.: 4.13 3rd Qu.:16791
## Max. :2011-12-09 12:50:00 Max. :38970.00 Max. :18287
## NA's :2 NA's :135080
## Country
## Length:541909
## Class :character
## Mode :character
##
##
##
##
# Preview the first 20 rows of the cleaned table.
preview_rows <- seq_len(20)
New_table[preview_rows, ]
Add a revenue column in R (multiply unit price and quantity)
# Work on a plain data.frame copy of the cleaned table (the conversion was
# previously performed twice; once is enough).
project <- data.frame(New_table)

# Revenue: line total (quantity x unit price); NA if either factor is NA.
# Inmonth: two-digit month ("01".."12") of the invoice timestamp. format() is
#          used instead of slicing characters 6-7 out of the implicitly
#          coerced date-time, so the result does not depend on how the
#          timestamp happens to print.
New_col_project <- mutate(
  project,
  Revenue = Quantity * UnitPrice,
  Inmonth = format(InvoiceDate, "%m")
)
New_col_project[1:20, ]

# Total revenue per calendar month (months from different years are pooled,
# because only the month part of the date is kept in Inmonth).
df <- sqldf("select Inmonth, sum(Revenue) as total from New_col_project group by Inmonth")
df
Create a bar plot for the sum of revenues by month.
library(ggplot2)
# Bar plot of total revenue per month.
# Inmonth is a discrete axis, so the bar width is a fraction of one category
# slot. The previous `width = 10` made every bar span about ten categories,
# so the bars overlapped and ggplot2 warned "position_stack requires
# non-overlapping x intervals". The default width keeps the bars separate.
ggplot(df, aes(x = Inmonth, y = total)) +
  geom_col(color = "Red") +
  ggtitle("My Bar Plot") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: position_stack requires non-overlapping x intervals