We are using the Covid-19 dataset from Kaggle.

Load the dataset into an R data frame

df <- read.csv("covid_19_data.csv")

Import required libraries

library(ggplot2)
library(ggpubr)

# The Covid-19 data loaded above is the data set analysed in this document.
# The stray Amazon stock-price load that stood here used curly quotes (a syntax
# error in R) and would have overwritten `df`, so it is disabled.
# df <- read.csv("amazon_data.csv")

Display the size of the dataset

nrow(df)
## [1] 306429
ncol(df)
## [1] 8

This dataset has 306429 observations of 8 variables.

Display the first 10 rows of the dataset

head(df, 10)
##    SNo ObservationDate Province.State Country.Region     Last.Update Confirmed
## 1    1      01/22/2020          Anhui Mainland China 1/22/2020 17:00         1
## 2    2      01/22/2020        Beijing Mainland China 1/22/2020 17:00        14
## 3    3      01/22/2020      Chongqing Mainland China 1/22/2020 17:00         6
## 4    4      01/22/2020         Fujian Mainland China 1/22/2020 17:00         1
## 5    5      01/22/2020          Gansu Mainland China 1/22/2020 17:00         0
## 6    6      01/22/2020      Guangdong Mainland China 1/22/2020 17:00        26
## 7    7      01/22/2020        Guangxi Mainland China 1/22/2020 17:00         2
## 8    8      01/22/2020        Guizhou Mainland China 1/22/2020 17:00         1
## 9    9      01/22/2020         Hainan Mainland China 1/22/2020 17:00         4
## 10  10      01/22/2020          Hebei Mainland China 1/22/2020 17:00         1
##    Deaths Recovered
## 1       0         0
## 2       0         0
## 3       0         0
## 4       0         0
## 5       0         0
## 6       0         0
## 7       0         0
## 8       0         0
## 9       0         0
## 10      0         0

Display all the column names of the dataset

names(df)
## [1] "SNo"             "ObservationDate" "Province.State"  "Country.Region" 
## [5] "Last.Update"     "Confirmed"       "Deaths"          "Recovered"
# Summarise every column of the data frame. The original statement was the bare
# name `summary.default`, which only prints that function's source code instead
# of describing the data.
summary(df)
## (output not shown; re-run to regenerate the per-column summary)

We are interested in the “Confirmed” variable.

summary(df$Confirmed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -302844    1042   10375   85671   50752 5863138

Plot the histogram of the “Confirmed” variable

hist(df$Confirmed)

# Log-scale histogram of Confirmed. The column contains zeros and negative
# values (see the summary minimum), for which log() yields -Inf/NaN, so only
# positive counts are kept; which() also drops any NA entries.
hist(log(df$Confirmed[which(df$Confirmed > 0)]))

Plot the boxplot of the “Confirmed” variable

boxplot(df$Confirmed)

# Log-scale boxplot of Confirmed. Zeros and negative values are excluded
# because log() maps them to -Inf/NaN, which cannot be drawn; which() also
# drops any NA entries.
boxplot(log(df$Confirmed[which(df$Confirmed > 0)]))

# Normal Q-Q plot of Confirmed with its reference line. qqline() draws the
# line for a normal Q-Q plot, so it has to follow qqnorm(); the original
# two-sample qqplot() against Recovered did not match that line.
qqnorm(df$Confirmed)
qqline(df$Confirmed)

# Line plot of Confirmed against the row index. seq_len() replaces
# 1:nrow(df), which would yield c(1, 0) for an empty data frame.
ggplot(data = df, aes(x = seq_len(nrow(df)), y = Confirmed, group = 1)) +
  geom_line()

ggqqplot(df$Confirmed)

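Based on the histograms, Q-Q plots, data summary, and boxplots, the variable “Confirmed” does not follow a normal distribution: the summary shows a mean (85671) far above the median (10375) and a maximum (5863138) far above the third quartile (50752), which indicates a strongly right-skewed distribution.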