Investing.com with R

02/11/2024 Yufy Firdiansyah, MM, S.Sos

The Investing.com Historical Data requires further cleaning to ensure data accuracy and usability. Specifically, the Date and Close columns contain entries that are not yet available, necessitating attention. Additionally, the Volume column needs to be processed to convert values representing billions (B), millions (M), and thousands (K) into their respective numerical values, with B representing 1,000,000,000, M representing 1,000,000, and K representing 1,000. The following code provides a step-by-step approach to achieve this data cleaning, ensuring that the dataset is ready for effective analysis.

Libraries & File

# Define a function to check for and install required packages
install_if_missing <- function(packages) {
  for (pkg in packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)  # Load the package after installation
  }
}

# List the required packages
required_packages <- c("dplyr", "stringr", "lubridate")

# Call the function to ensure all packages are installed and loaded
install_if_missing(required_packages)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# You can now proceed with your data processing using dplyr, stringr, and lubridate.
# Activating Libraries & reset the environment

library(dplyr)
library(stringr)
library(lubridate)
#Set Directory and Upload The File
setwd("~/RProject/Stocks_2/Data")
ADRO <- read.csv("ADRO Historical Data.csv")
head(ADRO)
##         Date Price  Open  High   Low    Vol. Change..
## 1 10/27/2024 3,780 3,690 3,780 3,580 217.64M    3.28%
## 2 10/20/2024 3,660 3,790 3,800 3,590 278.78M   -3.17%
## 3 10/13/2024 3,780 3,870 3,920 3,730 332.83M   -2.33%
## 4 10/06/2024 3,870 3,890 3,930 3,760 281.50M    1.57%
## 5 09/29/2024 3,810 3,910 3,930 3,710 325.66M   -2.56%
## 6 09/22/2024 3,910 3,720 3,940 3,680 454.27M    5.96%

Cleaning

#Tidy Up Column Name
column_names_1 <- colnames(ADRO)
column_names_1 <- trimws(colnames(ADRO))

head(ADRO)
##         Date Price  Open  High   Low    Vol. Change..
## 1 10/27/2024 3,780 3,690 3,780 3,580 217.64M    3.28%
## 2 10/20/2024 3,660 3,790 3,800 3,590 278.78M   -3.17%
## 3 10/13/2024 3,780 3,870 3,920 3,730 332.83M   -2.33%
## 4 10/06/2024 3,870 3,890 3,930 3,760 281.50M    1.57%
## 5 09/29/2024 3,810 3,910 3,930 3,710 325.66M   -2.56%
## 6 09/22/2024 3,910 3,720 3,940 3,680 454.27M    5.96%
#Change The Vol. Column to Volume

ADRO <-ADRO %>%
rename(Volume = 'Vol.')
head(ADRO)
##         Date Price  Open  High   Low  Volume Change..
## 1 10/27/2024 3,780 3,690 3,780 3,580 217.64M    3.28%
## 2 10/20/2024 3,660 3,790 3,800 3,590 278.78M   -3.17%
## 3 10/13/2024 3,780 3,870 3,920 3,730 332.83M   -2.33%
## 4 10/06/2024 3,870 3,890 3,930 3,760 281.50M    1.57%
## 5 09/29/2024 3,810 3,910 3,930 3,710 325.66M   -2.56%
## 6 09/22/2024 3,910 3,720 3,940 3,680 454.27M    5.96%
#Add Close Column to data frame

ADRO <-ADRO %>%
  mutate(Close = lead(Open))
head(ADRO)
##         Date Price  Open  High   Low  Volume Change.. Close
## 1 10/27/2024 3,780 3,690 3,780 3,580 217.64M    3.28% 3,790
## 2 10/20/2024 3,660 3,790 3,800 3,590 278.78M   -3.17% 3,870
## 3 10/13/2024 3,780 3,870 3,920 3,730 332.83M   -2.33% 3,890
## 4 10/06/2024 3,870 3,890 3,930 3,760 281.50M    1.57% 3,910
## 5 09/29/2024 3,810 3,910 3,930 3,710 325.66M   -2.56% 3,720
## 6 09/22/2024 3,910 3,720 3,940 3,680 454.27M    5.96% 3,670
#Delete the latest row because it dows not have Close Column
ADRO <- na.omit(ADRO)
head(ADRO)
##         Date Price  Open  High   Low  Volume Change.. Close
## 1 10/27/2024 3,780 3,690 3,780 3,580 217.64M    3.28% 3,790
## 2 10/20/2024 3,660 3,790 3,800 3,590 278.78M   -3.17% 3,870
## 3 10/13/2024 3,780 3,870 3,920 3,730 332.83M   -2.33% 3,890
## 4 10/06/2024 3,870 3,890 3,930 3,760 281.50M    1.57% 3,910
## 5 09/29/2024 3,810 3,910 3,930 3,710 325.66M   -2.56% 3,720
## 6 09/22/2024 3,910 3,720 3,940 3,680 454.27M    5.96% 3,670
#Convert Date Column to Date

ADRO$Date <- parse_date_time(ADRO$Date, orders = c("ymd", "dmy", "mdy"), quiet = TRUE)

head(ADRO)
##         Date Price  Open  High   Low  Volume Change.. Close
## 1 2024-10-27 3,780 3,690 3,780 3,580 217.64M    3.28% 3,790
## 2 2024-10-20 3,660 3,790 3,800 3,590 278.78M   -3.17% 3,870
## 3 2024-10-13 3,780 3,870 3,920 3,730 332.83M   -2.33% 3,890
## 4 2024-10-06 3,870 3,890 3,930 3,760 281.50M    1.57% 3,910
## 5 2024-09-29 3,810 3,910 3,930 3,710 325.66M   -2.56% 3,720
## 6 2024-09-22 3,910 3,720 3,940 3,680 454.27M    5.96% 3,670
# Function to clean and convert Volume column
clean_volume <- function(volume) {
  if (grepl('M', volume)) {
    return(as.numeric(gsub('M', '', volume)) * 1000000)
  } else if (grepl('B', volume)) {
    return(as.numeric(gsub('B', '', volume)) * 1000000000)
  } else if (grepl('K', volume)) {
    return(as.numeric(gsub('K', '', volume)) * 1000)
  } else {
    return(as.numeric(volume))
  }
}


# Apply the cleaning function to the Volume column
ADRO$Volume <- sapply(ADRO$Volume, clean_volume)



# Display the head of the modified dataframe
print(head(ADRO))
##         Date Price  Open  High   Low    Volume Change.. Close
## 1 2024-10-27 3,780 3,690 3,780 3,580 217640000    3.28% 3,790
## 2 2024-10-20 3,660 3,790 3,800 3,590 278780000   -3.17% 3,870
## 3 2024-10-13 3,780 3,870 3,920 3,730 332830000   -2.33% 3,890
## 4 2024-10-06 3,870 3,890 3,930 3,760 281500000    1.57% 3,910
## 5 2024-09-29 3,810 3,910 3,930 3,710 325660000   -2.56% 3,720
## 6 2024-09-22 3,910 3,720 3,940 3,680 454270000    5.96% 3,670
# Convert Open, High, Low, Close columns to numeric
# Function to clean and convert to numeric
clean_and_convert_numeric <- function(column) {
  # Remove any non-numeric characters (keeping dots for decimals)
  cleaned_column <- gsub("[^0-9.]", "", column)
  numeric_column <- as.numeric(cleaned_column)
  
  # Print out problematic entries that became NA
  na_entries <- column[is.na(numeric_column)]
  if (length(na_entries) > 0) {
    cat("Problematic entries in original column:", na_entries, "\n")
  }
  
  return(numeric_column)
}

# Use mutate to convert columns
ADRO <- ADRO %>%
  mutate(
    Price = clean_and_convert_numeric(Price),
    Open = clean_and_convert_numeric(Open),
    High = clean_and_convert_numeric(High),
    Low = clean_and_convert_numeric(Low),
    Close = clean_and_convert_numeric(Close),
    Change = clean_and_convert_numeric(Change..)
  )
print(head(ADRO))
##         Date Price Open High  Low    Volume Change.. Close Change
## 1 2024-10-27  3780 3690 3780 3580 217640000    3.28%  3790   3.28
## 2 2024-10-20  3660 3790 3800 3590 278780000   -3.17%  3870   3.17
## 3 2024-10-13  3780 3870 3920 3730 332830000   -2.33%  3890   2.33
## 4 2024-10-06  3870 3890 3930 3760 281500000    1.57%  3910   1.57
## 5 2024-09-29  3810 3910 3930 3710 325660000   -2.56%  3720   2.56
## 6 2024-09-22  3910 3720 3940 3680 454270000    5.96%  3670   5.96

Write

#Write the File
write.csv(ADRO, file = "ADROcleaned.csv", row.names = FALSE)

Conclusion

In conclusion, the Investing.com Historical Data requires necessary cleaning to ensure its reliability and accuracy, particularly in the Date, Close, and Volume columns. By systematically addressing these issues, the dataset will be optimized for effective analysis and investment decision-making.