We are using the COVID-19 dataset from Kaggle.

Loading the dataset into R dataframe

# Read the Kaggle CSV from the working directory into the data frame `df`
df <- read.csv(file = "covid_19_data.csv")

Import required libraries

library(ggplot2)
library(ggpubr)

# Load the COVID-19 dataset from Kaggle (plain ASCII quotes are required for R to parse the path)
df <- read.csv("covid_19_data.csv")

Display the size of the dataset

nrow(df)

## [1] 306429

ncol(df)

## [1] 8

This dataset has 306429 observations of 8 variables.

Display the first 10 rows of the dataset

head(df, 10)

##    SNo ObservationDate Province.State Country.Region     Last.Update Confirmed
## 1    1      01/22/2020          Anhui Mainland China 1/22/2020 17:00         1
## 2    2      01/22/2020        Beijing Mainland China 1/22/2020 17:00        14
## 3    3      01/22/2020      Chongqing Mainland China 1/22/2020 17:00         6
## 4    4      01/22/2020         Fujian Mainland China 1/22/2020 17:00         1
## 5    5      01/22/2020          Gansu Mainland China 1/22/2020 17:00         0
## 6    6      01/22/2020      Guangdong Mainland China 1/22/2020 17:00        26
## 7    7      01/22/2020        Guangxi Mainland China 1/22/2020 17:00         2
## 8    8      01/22/2020        Guizhou Mainland China 1/22/2020 17:00         1
## 9    9      01/22/2020         Hainan Mainland China 1/22/2020 17:00         4
## 10  10      01/22/2020          Hebei Mainland China 1/22/2020 17:00         1
##    Deaths Recovered
## 1       0         0
## 2       0         0
## 3       0         0
## 4       0         0
## 5       0         0
## 6       0         0
## 7       0         0
## 8       0         0
## 9       0         0
## 10      0         0

Display all the column names of the dataset

names(df)

## [1] "SNo"             "ObservationDate" "Province.State"  "Country.Region" 
## [5] "Last.Update"     "Confirmed"       "Deaths"          "Recovered"

# Per-column summary statistics for the whole data frame
summary(df)

We are interested in the “Confirmed” variable.

summary(df$Confirmed)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -302844    1042   10375   85671   50752 5863138

Plot the histogram of the “Confirmed” variable

hist(df$Confirmed)

# log() is only defined for positive values; the column contains zeros and
# negative entries, so keep only the positive counts before transforming.
confirmed_positive <- df$Confirmed[!is.na(df$Confirmed) & df$Confirmed > 0]
hist(log(confirmed_positive))

Plot the boxplot of the “Confirmed” variable

boxplot(df$Confirmed)

# log() is only defined for positive values; zeros would become -Inf (and
# distort the quartiles) and negative entries would become NaN, so keep only
# the positive counts before transforming.
confirmed_positive <- df$Confirmed[!is.na(df$Confirmed) & df$Confirmed > 0]
boxplot(log(confirmed_positive))

# Normal Q-Q plot of Confirmed: sample quantiles against theoretical normal
# quantiles, with the reference line that qqline() fits for that same sample.
qqnorm(df$Confirmed)
qqline(df$Confirmed)

# Line plot of Confirmed against row position; seq_len() yields an empty
# index for an empty data frame, whereas 1:nrow(df) would yield c(1, 0).
ggplot(data = df, aes(x = seq_len(nrow(df)), y = Confirmed, group = 1)) +
  geom_line()

ggqqplot(df$Confirmed)

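Based on the histograms, Q-Q plots, data summary, and boxplots, the variable “Confirmed” does not follow a normal distribution: it is strongly right-skewed (the mean, 85671, is far above the median, 10375, and the maximum is 5863138).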

Test 4

Namrata, Shruti, Elmira -

13, 10, 2021

Loading the dataset into R dataframe

Import required libraries

Display the size of the dataset

This dataset has 306429 observations of 8 variables.

Display the first 10 rows of the dataset

Display all the column names of the dataset

We are interested in the “Confirmed” variable.

Plot the histogram of the “Confirmed” variable

Plot the boxplot of the “Confirmed” variable

Based on the histograms, Q-Q plots, data summary, and boxplots, the variable “Confirmed” does not follow a normal distribution; it is strongly right-skewed.