Part(a)

Clean the data if necessary (e.g., fix incorrect or inconsistent values, or change them to NA if there is no obvious fix).

library(readxl)

# Load the raw retail workbook.
Retail_csv <- read_excel("~/Retail.csv.xlsx")

# Free-text notes that appear in the Description column in place of a real
# product description. They are matched exactly (case- and space-sensitive).
placeholder_descriptions <- c(
  "?", "??", "???", "?sold as sets?", "?lost", "wet", "sold in set?",
  "?missing", "missing?", "???lost", "POSSIBLE DAMAGES OR LOST?",
  "? sold as sets?", "???missing", "wet?", "lost??", "damages?",
  "???damages???", "check?", "damages/dotcom?",
  "historic computer difference?.....se",
  "historic computer difference?....se", "?display", "?code mix up?84930",
  "?? missing", "????damages????", "????missing", "?display?",
  "wrong code?", "code mix up? 84930"
)

# Replace placeholder descriptions with NA; every other value is kept as is.
Retail <- ifelse(
  Retail_csv$Description %in% placeholder_descriptions,
  NA,
  Retail_csv$Description
)

Retail_csv$Description <- Retail

New_table <- Retail_csv

# Negative quantities and negative unit prices are treated as invalid -> NA.
New_table_Quantity <- replace(New_table$Quantity, New_table$Quantity < 0, NA)

New_table_UnitPrice <- replace(New_table$UnitPrice, New_table$UnitPrice < 0, NA)

# Write the cleaned vectors back by column NAME rather than by position
# (previously columns 4 and 6), so a reordered or extended sheet cannot
# cause the wrong columns to be overwritten.
New_table$Quantity <- New_table_Quantity
New_table$UnitPrice <- New_table_UnitPrice

summary(New_table)
##   InvoiceNo          StockCode         Description           Quantity       
##  Length:541909      Length:541909      Length:541909      Min.   :    1.00  
##  Class :character   Class :character   Class :character   1st Qu.:    1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    3.00  
##                                                           Mean   :   10.66  
##                                                           3rd Qu.:   10.00  
##                                                           Max.   :80995.00  
##                                                           NA's   :10624     
##   InvoiceDate                    UnitPrice          CustomerID    
##  Min.   :2010-12-01 08:26:00   Min.   :    0.00   Min.   :12346   
##  1st Qu.:2011-03-28 11:34:00   1st Qu.:    1.25   1st Qu.:13953   
##  Median :2011-07-19 17:17:00   Median :    2.08   Median :15152   
##  Mean   :2011-07-04 13:34:57   Mean   :    4.65   Mean   :15288   
##  3rd Qu.:2011-10-19 11:27:00   3rd Qu.:    4.13   3rd Qu.:16791   
##  Max.   :2011-12-09 12:50:00   Max.   :38970.00   Max.   :18287   
##                                NA's   :2          NA's   :135080  
##    Country         
##  Length:541909     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Preview the first 20 cleaned rows. head() simply returns fewer rows when the
# table is shorter than 20, whereas indexing with 1:20 would reach past the end.
head(New_table, 20)

Part(b)

Add a revenue column in R (multiply unit price and quantity)

# mutate() comes from dplyr and sqldf() from sqldf; attach both here so this
# section runs on its own.
library(dplyr)
library(sqldf)

# Work on a plain data.frame copy of the cleaned table.
project <- data.frame(New_table)

# Revenue: quantity times unit price for each invoice line.
# Inmonth: characters 6-7 of the invoice timestamp's text form
#          ("YYYY-MM-DD ..."), i.e. the two-digit month.
New_col_project <- mutate(
  project,
  Revenue = Quantity * UnitPrice,
  Inmonth = substring(InvoiceDate, 6, 7)
)

# Preview the first 20 rows with the new columns.
head(New_col_project, 20)

# Total revenue per month.
# NOTE(review): Inmonth carries no year, and the summary of InvoiceDate runs
# from 2010-12 to 2011-12, so month "12" pools both Decembers - confirm that
# this is the intended grouping.
df <- sqldf("select Inmonth, sum(Revenue) as total from New_col_project group by Inmonth")

df

Part(c)

Create a bar plot for the sum of revenues by month.

library(ggplot2)

# Bar chart of total revenue (`total`) for each invoice month (`Inmonth`) in
# `df`, with red bar outlines and a centred title.
# The bar width is left at the geom_col() default. A width above 1 (such as
# 10) makes neighbouring bars overlap on this discrete x axis and triggers the
# warning "position_stack requires non-overlapping x intervals".
ggplot(df, aes(x = Inmonth, y = total)) +
  geom_col(color = "Red") +
  ggtitle("My Bar Plot") +
  theme(plot.title = element_text(hjust = 0.5))