This Workshop includes:
* Read a data set in RStudio
* Find missing values and replace them with the median of the column.
* Remove the row of the missing value if it is a categorical variable.
* Create a boxplot of every numerical variable.
* Remove outliers using boxplot
* Rename the columns
* Create new columns
* Plot a correlation matrix as a heatmap
* Plot Scatter Plot with three variables (x, y and colour)
* Plot a histogram of one variable

Install Packages

We have to install the packages that we are going to use. It’s better to run them directly into the console.

#install.packages("readr") 
#install.packages("ggplot2") 
#install.packages("reshape2")

Call Packages

Using the library function, we load the packages. That means: “Hey, machine, prepare to use these packages”

library(readr)
library(reshape2)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

Read and Import the Data

After running these lines, the new DataSet will appear in the Environment pane with some information about it.

# Import the data set. stringsAsFactors = TRUE keeps text columns such as
# Species as factors on every R version (the default became FALSE in R 4.0.0);
# the per-level counts that summary() prints for Species depend on it.
DataSet <- read.csv("exercise.csv", stringsAsFactors = TRUE)
head(DataSet) # first six rows, as a quick sanity check
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5           NA         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Find missing values

Use the function is.na() to search for NAs. After that, use sum() to show how many there are. It’s good to double-check them using summary().

# Logical matrix with TRUE wherever a cell of DataSet is NA
my_na <- is.na(DataSet) #Search NA
# TRUE counts as 1, so the sum is the total number of missing cells
sum(my_na) #How many NA
## [1] 15
# summary() adds an "NA's" entry for every column that has missing values
summary(DataSet) #How many per column 
##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.100   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.529   Mean   :  4.418   Mean   :  5.796   Mean   :  3.939  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##        Species  
##  setosa    :48  
##  versicolor:50  
##  virginica :49  
##  NA's      : 3  
##                 
##                 
## 

As you can see, every column has 3 NAs, which gives 15 in total.

Replace NA with the Median

Using these lines, we are telling: “look for the median for each feature and put this number into the cells that you have NAs in the same feature”.

# Median imputation: within each measurement column every NA is overwritten
# with that column's median, computed with the NAs left out.
DataSet$Sepal.Length <- replace(
  DataSet$Sepal.Length,
  is.na(DataSet$Sepal.Length),
  median(DataSet$Sepal.Length, na.rm = TRUE)
)
DataSet$Sepal.Width <- replace(
  DataSet$Sepal.Width,
  is.na(DataSet$Sepal.Width),
  median(DataSet$Sepal.Width, na.rm = TRUE)
)
DataSet$Petal.Length <- replace(
  DataSet$Petal.Length,
  is.na(DataSet$Petal.Length),
  median(DataSet$Petal.Length, na.rm = TRUE)
)
DataSet$Petal.Width <- replace(
  DataSet$Petal.Width,
  is.na(DataSet$Petal.Width),
  median(DataSet$Petal.Width, na.rm = TRUE)
)

Deleting Rows with NA

The column Species is categorical and (in this case) we can delete the rows that have NAs.

# Drop every row that still contains an NA. The four numeric columns were
# filled with their medians just before this step, so only rows with a missing
# Species are expected to be removed here.
DataSet <- na.omit(DataSet)
# Re-check: the "NA's" entries should no longer appear in the summary
summary(DataSet)
##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.150   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.539   Mean   :  4.415   Mean   :  5.829   Mean   :  3.947  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##        Species  
##  setosa    :48  
##  versicolor:50  
##  virginica :49  
##                 
##                 
## 

Create BoxPlot and Remove Outliers

Sepal.Length

Let’s see what happens with the feature Sepal.Length:

boxplot(DataSet[["Sepal.Length"]]) # look at the distribution to spot outliers

As you can see, it has one outlier. Let’s identify it…

boxplot.stats(DataSet[["Sepal.Length"]])[["out"]] # values lying beyond the whiskers
## [1] 106.8

That’s it! Now I save the outlier values in a new variable and look up the rows that contain them.

# Store the flagged Sepal.Length values so they can be reused afterwards
outliers_SL <- boxplot.stats(DataSet[["Sepal.Length"]])[["out"]]
# Show the rows holding one of those values (%in% yields a plain TRUE/FALSE mask)
DataSet[DataSet[["Sepal.Length"]] %in% outliers_SL, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 113        106.8           3          5.5         2.1 virginica

After saving the outliers, we can delete their rows from our main DataSet.

# Keep only the rows whose Sepal.Length is not an outlier. A logical mask is
# used instead of -which(...): when nothing matches, which() returns integer(0)
# and DataSet[-integer(0), ] would drop every row instead of none.
DataSet <- DataSet[!(DataSet$Sepal.Length %in% outliers_SL), ]

Now, check the boxplot again:

boxplot(DataSet[["Sepal.Length"]]) # redraw to confirm the outlier is gone

We have to repeat the process for every feature.

Sepal.Width

# Visual check of Sepal.Width
boxplot(DataSet$Sepal.Width)

# Values lying beyond the boxplot whiskers
boxplot.stats(DataSet$Sepal.Width)$out
## [1]   4.4 203.5   4.1   4.2   2.0
# Store the flagged values, then list the rows that contain them
outliers_SW <- boxplot.stats(DataSet$Sepal.Width)$out
DataSet[DataSet$Sepal.Width %in% outliers_SW, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16          5.7         4.4          1.5         0.4     setosa
## 28          5.2       203.5          1.5         1.3     setosa
## 33          5.2         4.1          1.5         0.1     setosa
## 34          5.5         4.2          1.4         0.2     setosa
## 61          5.0         2.0          3.5         1.0 versicolor
# Keep only the non-outlier rows. A logical mask is used instead of
# -which(...): with no match, which() returns integer(0) and
# DataSet[-integer(0), ] would drop every row instead of none.
DataSet <- DataSet[!(DataSet$Sepal.Width %in% outliers_SW), ]

Petal.Length

# Visual check of Petal.Length
boxplot(DataSet$Petal.Length)

# Values lying beyond the boxplot whiskers
boxplot.stats(DataSet$Petal.Length)$out
## [1] 304.4
# Store the flagged values, then list the rows that contain them
outliers_PL <- boxplot.stats(DataSet$Petal.Length)$out
DataSet[DataSet$Petal.Length %in% outliers_PL, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 88          6.3         2.3        304.4         1.3 versicolor
# Keep only the non-outlier rows. A logical mask is used instead of
# -which(...): with no match, which() returns integer(0) and
# DataSet[-integer(0), ] would drop every row instead of none.
DataSet <- DataSet[!(DataSet$Petal.Length %in% outliers_PL), ]

Petal.Width

# Visual check of Petal.Width
boxplot(DataSet$Petal.Width)

# Values lying beyond the boxplot whiskers
boxplot.stats(DataSet$Petal.Width)$out
## [1] 401.5
# Store the flagged values, then list the rows that contain them
outliers_PW <- boxplot.stats(DataSet$Petal.Width)$out
DataSet[DataSet$Petal.Width %in% outliers_PW, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 67          5.6           3          4.5       401.5 versicolor
# Keep only the non-outlier rows. A logical mask is used instead of
# -which(...): with no match, which() returns integer(0) and
# DataSet[-integer(0), ] would drop every row instead of none.
DataSet <- DataSet[!(DataSet$Petal.Width %in% outliers_PW), ]

Rename Species (Plants)

New name = Plants

# Rename the column called "Species" to "Plants"; all other names are untouched
names(DataSet)[names(DataSet) == "Species"] <- "Plants"

Create new Column

Name of the new column = Petal.Area

# New column Petal.Area: row-wise product of petal length and petal width
DataSet[["Petal.Area"]] <- DataSet[["Petal.Length"]] * DataSet[["Petal.Width"]]

Plot Correlation Matrix HeatMap

# The correlation uses the four measurement columns only, so the categorical
# "Plants" column is left as it is: overwriting it with numeric codes adds
# nothing to the matrix and would throw away the species labels needed to
# colour later plots by species.
# Columns are picked by name so the selection does not depend on column order.
DataCorrelation <- DataSet[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
# Pairwise correlations, rounded to two decimals for display
Correlation <- round(cor(DataCorrelation), 2)
head(Correlation)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length         1.00       -0.09         0.85        0.80
## Sepal.Width         -0.09        1.00        -0.38       -0.31
## Petal.Length         0.85       -0.38         1.00        0.94
## Petal.Width          0.80       -0.31         0.94        1.00
# Long format: one row per (Var1, Var2) pair with its correlation in "value"
melted_correlation <- melt(Correlation)
head(melted_correlation)
##           Var1         Var2 value
## 1 Sepal.Length Sepal.Length  1.00
## 2  Sepal.Width Sepal.Length -0.09
## 3 Petal.Length Sepal.Length  0.85
## 4  Petal.Width Sepal.Length  0.80
## 5 Sepal.Length  Sepal.Width -0.09
## 6  Sepal.Width  Sepal.Width  1.00
# Heatmap: one tile per pair, filled by correlation, with the value printed in a
# label. geom_label() already draws the text, so a separate geom_text() layer
# underneath it is not needed.
plot <- ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value, label = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "#132B43", high = "#56B1F7", mid = "white") +
  geom_label()
plot

Plot Scatter Plot

Plot Petal.Width vs Petal.Length and different colours according to species

# Petal.Width against Petal.Length, one colour per plant group. factor() makes
# the colour scale discrete even when Plants holds numeric codes, which would
# otherwise be drawn as a continuous gradient; labs() keeps the legend title
# as "Plants".
ggplot(DataSet, aes(x = Petal.Width, y = Petal.Length, col = factor(Plants))) +
  geom_point() +
  labs(colour = "Plants")

Plot Histogram

# Histogram of Sepal.Length after cleaning. hist() builds its default title and
# x-axis label from the expression passed in, so the call is kept exactly as is.
hist(DataSet$Sepal.Length)