# Open RStudio.
# Go to File > New File > R Markdown....
# Enter a title (e.g., "Data Wrangling Practice").
# Choose "HTML" as the output format (you can change it later if needed).
# Click OK.
# Load the customer data from the Desktop's "Data Wrangling Practice" folder.
# read.csv() only turns blank cells into NA for logical/numeric columns, so
# blank text cells would otherwise stay as "" and be missed by NA-based
# steps such as drop_na() and replace_na(). Listing "" in na.strings makes
# blank text cells NA as well.
customers <- read.csv(
  "C:\\Users\\Rehan\\Desktop\\Data Wrangling Practice\\customers-10000.csv",
  na.strings = c("", "NA")
)
# Preview the first six rows
head(customers)
# Inspect the column names, types and a sample of values
str(customers)
## 'data.frame': 10000 obs. of 12 variables:
## $ Index : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Customer.Id : chr "EB54EF1154C3A78" "10dAcafEBbA5FcA" "67DAB15Ebe4BE4a" "6d350C5E5eDB4EE" ...
## $ First.Name : chr "Heather" "Kristina" "Briana" "Patty" ...
## $ Last.Name : chr "Callahan" "Ferrell" "Andersen" "Ponce" ...
## $ Company : chr "Mosley-David" "Horn, Shepard and Watson" "Irwin-Oneal" "Richardson Group" ...
## $ City : chr "Lake Jeffborough" "Aaronville" "East Jordan" "East Kristintown" ...
## $ Country : chr "Norway" "Andorra" "Nepal" "Northern Mariana Islands" ...
## $ Phone.1 : chr "043-797-5229" "932-062-1802" "8352752061" "302.398.3833" ...
## $ Phone.2 : chr "915.112.1727" "(209)172-7124x3651" "(567)135-1918" "196-189-7767x770" ...
## $ Email : chr "urangel@espinoza-francis.net" "xreese@hall-donovan.com" "haleybraun@blevins-sexton.com" "hohailey@anthony.com" ...
## $ Subscription.Date: chr "2020-08-26" "2020-04-27" "2022-03-22" "2020-07-02" ...
## $ Website : chr "http://www.escobar.org/" "https://tyler-pugh.info/" "https://www.mack-bell.net/" "https://delacruz-freeman.org/" ...
# Get a per-column summary of the dataset: the integer Index column is
# reported as min / quartiles / mean / max, while character columns only
# report their length, class and mode
summary(customers)
## Index Customer.Id First.Name Last.Name
## Min. : 1 Length:10000 Length:10000 Length:10000
## 1st Qu.: 2501 Class :character Class :character Class :character
## Median : 5000 Mode :character Mode :character Mode :character
## Mean : 5000
## 3rd Qu.: 7500
## Max. :10000
## Company City Country Phone.1
## Length:10000 Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Phone.2 Email Subscription.Date Website
## Length:10000 Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
# List the column names of the data frame
names(customers)
## [1] "Index" "Customer.Id" "First.Name"
## [4] "Last.Name" "Company" "City"
## [7] "Country" "Phone.1" "Phone.2"
## [10] "Email" "Subscription.Date" "Website"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the identifier, first name and email columns
customers_selected <- select(customers, Customer.Id, First.Name, Email)
# Preview the reduced dataset
head(customers_selected)
# Keep only the rows whose Country is exactly "Norway"
customers_norway <- filter(customers, Country == "Norway")
# Preview the matching rows
head(customers_norway)
# Add a FullName column: first and last name joined by a single space
customers_fullname <- mutate(
  customers,
  FullName = paste(First.Name, Last.Name, sep = " ")
)
# Preview the data with the new column
head(customers_fullname)
# Step 7: Handling missing data ----
library(tidyr)
# Option 1: drop every row that has an NA in any column
customers_clean <- drop_na(customers)
# Option 2: keep all rows, but replace missing Phone.1 values with "Unknown"
customers_filled <- mutate(
  customers,
  Phone.1 = replace_na(Phone.1, "Unknown")
)
# Count customers per country: one output row per distinct Country value
customers_by_country <- summarise(
  group_by(customers, Country),
  NumCustomers = n()
)
# Print the per-country counts
customers_by_country
# Order the countries from most to fewest customers
customers_sorted <- arrange(customers_by_country, desc(NumCustomers))
# Print the ranked table
customers_sorted
# Export the cleaned data to a CSV file.
# row.names = FALSE stops write.csv() from adding an unnamed column of row
# numbers, which would show up as an extra "X" column when the file is
# read back in with read.csv().
write.csv(
  customers_clean,
  "C:\\Users\\Rehan\\Desktop\\Data Wrangling Practice\\cleaned_customers.csv",
  row.names = FALSE
)
# View() opens the interactive data viewer, so only call it in an
# interactive session; non-interactive runs (knitting, Rscript) may have no
# viewer available and would otherwise stop here.
if (interactive()) {
  View(customers_clean)
}