1 Goal


The goal of this tutorial is to understand the different parameters that can help us when we read a csv file.


2 Separator


# In this tutorial we are going to work with the iris plant dataset
# However we have done a few modifications to the dataset
# Let's try to open the file

Iris_dataset <- read.csv("iris_NA.csv")
str(Iris_dataset)
## 'data.frame':    150 obs. of  1 variable:
##  $ Sepal.Length.Sepal.Width.Petal.Length.Petal.Width.Species: Factor w/ 149 levels ";2.5;4.5;1.7;virginica",..: 38 23 14 10 34 53 12 31 6 24 ...
head(Iris_dataset)
##   Sepal.Length.Sepal.Width.Petal.Length.Petal.Width.Species
## 1                                    5.1;3.5;1.4;0.2;setosa
## 2                                      4.9;3;1.4;0.2;setosa
## 3                                    4.7;3.2;1.3;0.2;setosa
## 4                                    4.6;3.1;1.5;0.2;setosa
## 5                                      5;3.6;1.4;0.2;setosa
## 6                                    5.4;3.9;1.7;0.4;setosa
# Notice that it looks different to the original data
data("iris")
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# If we look closely we find that instead of comma the file uses ";" as column separator
# We can define the separator of a csv file with the parameter sep=
Iris_dataset <- read.csv("iris_NA.csv", sep = ";")

# Now the dataframe contains all variables
head(Iris_dataset)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9           3          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5            5         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

4 Avoid reading columns as factors


# However we have introduced some missing values
str(Iris_dataset)
## 'data.frame':    151 obs. of  5 variables:
##  $ V1: Factor w/ 38 levels "","?","4.3","4.4",..: 38 11 9 7 6 10 14 6 10 4 ...
##  $ V2: Factor w/ 25 levels "?","2","2.2",..: 25 16 11 13 12 17 20 15 15 10 ...
##  $ V3: Factor w/ 45 levels "?","1","1.1",..: 45 6 6 5 7 6 9 6 7 6 ...
##  $ V4: Factor w/ 23 levels "0.1","0.2","0.3",..: 23 2 2 2 2 2 4 3 2 2 ...
##  $ V5: Factor w/ 5 levels "?","setosa","Species",..: 3 2 2 2 2 2 2 2 2 2 ...
# The first thing we notice is that numerical variables have been turned into factors
summary(Iris_dataset)
##        V1            V2           V3           V4              V5    
##  5      :  9   3      :26   1.4    :13   0.2    :29   ?         : 2  
##  6.3    :  9   2.8    :14   1.5    :12   1.3    :13   setosa    :49  
##  5.1    :  8   3.2    :13   4.5    : 8   1.5    :12   Species   : 1  
##  6.7    :  8   3.4    :12   5.1    : 8   1.8    :12   versicolor:50  
##  5.5    :  7   2.9    :10   1.3    : 7   1.4    : 8   virginica :49  
##  5.7    :  7   3.1    :10   1.6    : 7   2.3    : 8                  
##  (Other):103   (Other):66   (Other):96   (Other):69
# This is because we have introduced the missing values as "?", so R reads all the column as a character
# It is possible to find numerical columns with NAs in text form like: "NA", "?", "-", " ", etc
# If a numerical value contains a dollar or euro sign or percentage, the effect will be the same
# The first thing we can do is to read all columns as characters avoiding creating factors
Iris_dataset <- read.csv("iris_NA.csv",sep =";", stringsAsFactors = FALSE)
str(Iris_dataset)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: chr  "5.1" "4.9" "4.7" "4.6" ...
##  $ Sepal.Width : chr  "3.5" "3" "3.2" "3.1" ...
##  $ Petal.Length: chr  "1.4" "1.4" "1.3" "1.5" ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : chr  "setosa" "setosa" "setosa" "setosa" ...
# Much better

5 Reading the missing values


# We have avoided reading everything as factors because of the missing values
# Now we can try to identify those missing values
# We can do so using the function na.strings

Iris_dataset <- read.csv("iris_NA.csv",sep =";", na.strings ="?", stringsAsFactors = FALSE)

# Remember that we could find different missing value characters inside the same dataset
# In this case we should use a vector as an input na.strings = c("?", "NA", etc)
str(Iris_dataset)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : chr  "setosa" "setosa" "setosa" "setosa" ...
summary(Iris_dataset)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.862   Mean   :3.048   Mean   :3.761   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##  NA's   :4       NA's   :3       NA's   :2                      
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# However we want the Species to be a factor
Iris_dataset$Species <- factor(Iris_dataset$Species)
str(Iris_dataset)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(Iris_dataset)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.862   Mean   :3.048   Mean   :3.761   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##  NA's   :4       NA's   :3       NA's   :2                      
##        Species  
##  setosa    :49  
##  versicolor:50  
##  virginica :49  
##  NA's      : 2  
##                 
##                 
## 

6 Conclusion


In this tutorial we have learnt how to read a csv file using properly the different options. Sometimes reading a file is a matter of trial and error but following these steps most of the problems can be solved.