Data Wrangling Practice in RStudio using R

Step 1: Setting Up Your R Markdown File

# Open RStudio.
# Go to File > New File > R Markdown....
# Enter a title (e.g., "Data Wrangling Practice").
# Choose "HTML" as the output format (you can change it later if needed).
# Click OK.

Step 2: Importing Your CSV File

# Read the customer records from the Data Wrangling Practice folder on the
# Desktop. By default read.csv() only turns blank fields into NA for
# logical/numeric columns; listing "" in na.strings makes blank text fields NA
# as well, so drop_na() and replace_na() can actually detect them.
customers <- read.csv(
  "C:\\Users\\Rehan\\Desktop\\Data Wrangling Practice\\customers-10000.csv",
  na.strings = c("", "NA")
)

# Preview the first rows to confirm the import worked
head(customers)

Step 3: Inspecting the Data

# Show each column's type together with a preview of its first values
utils::str(customers)
## 'data.frame':    10000 obs. of  12 variables:
##  $ Index            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Customer.Id      : chr  "EB54EF1154C3A78" "10dAcafEBbA5FcA" "67DAB15Ebe4BE4a" "6d350C5E5eDB4EE" ...
##  $ First.Name       : chr  "Heather" "Kristina" "Briana" "Patty" ...
##  $ Last.Name        : chr  "Callahan" "Ferrell" "Andersen" "Ponce" ...
##  $ Company          : chr  "Mosley-David" "Horn, Shepard and Watson" "Irwin-Oneal" "Richardson Group" ...
##  $ City             : chr  "Lake Jeffborough" "Aaronville" "East Jordan" "East Kristintown" ...
##  $ Country          : chr  "Norway" "Andorra" "Nepal" "Northern Mariana Islands" ...
##  $ Phone.1          : chr  "043-797-5229" "932-062-1802" "8352752061" "302.398.3833" ...
##  $ Phone.2          : chr  "915.112.1727" "(209)172-7124x3651" "(567)135-1918" "196-189-7767x770" ...
##  $ Email            : chr  "urangel@espinoza-francis.net" "xreese@hall-donovan.com" "haleybraun@blevins-sexton.com" "hohailey@anthony.com" ...
##  $ Subscription.Date: chr  "2020-08-26" "2020-04-27" "2022-03-22" "2020-07-02" ...
##  $ Website          : chr  "http://www.escobar.org/" "https://tyler-pugh.info/" "https://www.mack-bell.net/" "https://delacruz-freeman.org/" ...
# Print a per-column summary (quartiles for numeric columns,
# length/class/mode for character columns)
summary(object = customers)
##      Index       Customer.Id         First.Name         Last.Name        
##  Min.   :    1   Length:10000       Length:10000       Length:10000      
##  1st Qu.: 2501   Class :character   Class :character   Class :character  
##  Median : 5000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 5000                                                           
##  3rd Qu.: 7500                                                           
##  Max.   :10000                                                           
##    Company              City             Country            Phone.1         
##  Length:10000       Length:10000       Length:10000       Length:10000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Phone.2             Email           Subscription.Date    Website         
##  Length:10000       Length:10000       Length:10000       Length:10000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
# List the column names (for a data frame, names() returns the same
# character vector as colnames())
names(customers)
##  [1] "Index"             "Customer.Id"       "First.Name"       
##  [4] "Last.Name"         "Company"           "City"             
##  [7] "Country"           "Phone.1"           "Phone.2"          
## [10] "Email"             "Subscription.Date" "Website"

Step 4: Select Specific Columns

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the ID, first-name and email columns
customers_selected <- select(customers, Customer.Id, First.Name, Email)

# Preview the narrowed dataset
head(customers_selected)

Step 5: Filter Rows Based on Conditions

# Keep only the rows whose Country is exactly "Norway"
customers_norway <- filter(customers, Country == "Norway")

# Preview the Norwegian customers
head(customers_norway)

Step 6: Create a New Column

# Add a FullName column: first and last name joined by a single space
customers_fullname <- mutate(
  customers,
  FullName = paste(First.Name, Last.Name, sep = " ")
)

# Preview the data with the new column
head(customers_fullname)

Step 7: Handling Missing Data

# tidyr supplies the missing-value helpers used in this step
library(tidyr)

# Option 1: discard every row that has an NA in any column
customers_clean <- drop_na(customers)

# Option 2: keep all rows and put "Unknown" wherever Phone.1 is NA
customers_filled <- replace_na(customers, list(Phone.1 = "Unknown"))

Step 8: Group and Summarize Data

# Count customers per country: one output row per Country, with the group
# size stored in NumCustomers (tally() on grouped data is shorthand for
# summarise(NumCustomers = n()))
customers_by_country <- customers %>%
  group_by(Country) %>%
  tally(name = "NumCustomers")

# Print the per-country counts
customers_by_country

Step 9: Sort Data

# Order the countries from most to fewest customers
customers_sorted <- arrange(customers_by_country, desc(NumCustomers))

# Print the ranked table
customers_sorted

Step 10: Export the Cleaned Data

# Export the cleaned data to a CSV file. row.names = FALSE keeps write.csv()
# from adding an unnamed leading column of row numbers, which would come back
# as an extra "X" column when the file is read again.
write.csv(
  customers_clean,
  "C:\\Users\\Rehan\\Desktop\\Data Wrangling Practice\\cleaned_customers.csv",
  row.names = FALSE
)

Step 11: Knit Your R Markdown

# Open the cleaned data in the data viewer, but only in an interactive
# session: View() pops up a viewer window, which is not useful (and can fail)
# in non-interactive runs such as knitting with the Knit button.
if (interactive()) {
  View(customers_clean)
}