GIT Report

2025-04-30

Graduate Internship Training Progress Report

Start date: September, 2024

Completed course(s)

R Tutorial for Beginners 2022: R Programming Full Course in 7 Hours (Simplilearn).

Variables in R
Data types in R
Logical operators
Print formatting
Vectors

# Numeric vector built with c().
v1 <- c(1, 2, 3, 4, 5)
# Character vector of colour names; its contents are printed below.
V2 <- c("red", "green", "yellow")
print(V2)

## [1] "red"    "green"  "yellow"

Completed course(s), continued

List

# A list can hold vectors of different types and lengths under named
# elements: here numeric (x), character (Y) and logical (Z).
List_1 <- list(
  x = c(10, 20, 30),
  Y = c("a", "b", "c"),
  Z = c(TRUE, FALSE)
)
print(List_1)

## $x
## [1] 10 20 30
## 
## $Y
## [1] "a" "b" "c"
## 
## $Z
## [1]  TRUE FALSE

Matrix

# 3 x 3 integer matrix filled row by row with 1..9.
# byrow is spelled TRUE rather than T: T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved constant.
M <- matrix(1:9, nrow = 3, ncol = 3, byrow = TRUE)
M

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9

Data frame

# Small data frame with one row per person and one column per attribute.
BMI <- data.frame(
  Gender = c("Male", "Male", "Female"),
  Height = c(152, 171.5, 165),
  Weight = c(81, 93, 78),
  Age = c(42, 38, 26)
)

print(BMI)

##   Gender Height Weight Age
## 1   Male  152.0     81  42
## 2   Male  171.5     93  38
## 3 Female  165.0     78  26

If statement in R

# The L suffix stores 30 as an integer rather than a double.
X <- 30L
# The message is printed only when X has integer type.
if (is.integer(X)) {
  print("x is an Integer")
}

## [1] "x is an Integer"

While and for loop in R

# Print the message while the counter stays below 7. The counter starts at 2
# and grows by 3 per pass, so the body runs twice (count = 2 and count = 5).
V <- "Hello World"
count <- 2
while (count < 7) {
  print(V)
  count <- count + 3
}

## [1] "Hello World"
## [1] "Hello World"

# Iterate over the elements of a character vector, printing each in turn.
fruit <- c("Apple", "Orange", "Passion fruit", "Banana")
for (i in fruit) {
  print(i)
}

## [1] "Apple"
## [1] "Orange"
## [1] "Passion fruit"
## [1] "Banana"

Functions in R

# Evaluate C - S + (a / b) * log(S).
#
# S, a, b, C: numeric values (vectors are handled element-wise by the
#   arithmetic operators). log() is the natural logarithm, so S must be
#   positive for a finite result.
# Returns the numeric value of the expression.
# NOTE: this name shadows base::I() for the rest of the session.
I <- function(S, a, b, C) {
  ratio <- a / b
  C - S + ratio * log(S)
}

Regular expressions in R, used for pattern matching (grepl() and grep())
Built-in R functions
- seq(); creating sequences
- sort(); rearranging sequences
- rev(); reversing elements in R objects
- append(); combining objects in R
Data manipulation in R: dplyr, tidyr.
- etc

CS50’s Introduction to Programming with R

Representing data; including how to input data in R, where to place this data, and how to prepare data for further analysis.
Transforming data; which includes adjusting the arrangement or view of the data-set.
Applying functions; wrapping a formula or an equation in a “function” so that experiments can be carried out simply by calling that function.
Tidying Data; involves the cleaning and preparation process of a data-set before further analysis.
Visualizing Data; involves creating visual representations of the data to allow for easy understanding and explanation.

Projects and tasks embarked on

Prediction of FUOYE Enrollment given by Prof. Bakare

Data visualization

library(tidyverse)
library(dplyr)
library(ggplot2)

# Load the FUOYE enrollment records.
fouye <- read.csv("C:\\Users\\hp\\Desktop\\RRR\\FUOYE ENROLLMENT.csv")
# Keep only rows with a recorded total enrollment. is.na() is the reliable
# test for missing values; comparing against the string "NA" only worked
# because subset() drops rows whose condition evaluates to NA.
fouye <- subset(fouye, !is.na(TOTAL.ENROLLED))

# Columns 1 and 14 are used below as YEAR and SOCIAL.SCIENCES.
fouye_plots_social <- fouye[, c(1, 14)]
# Bar chart of the SOCIAL.SCIENCES values per year; bar fill follows the
# same value. geom_col() is the direct form of geom_bar(stat = "identity").
fou14 <- ggplot(fouye_plots_social, aes(x = YEAR, y = SOCIAL.SCIENCES)) +
  geom_col(aes(fill = SOCIAL.SCIENCES)) +
  theme_minimal() +
  scale_x_continuous(limits = c(2010, 2025))

fou14

Growth rate(and visualization)

# Year and total-enrollment columns only.
fouye_plots_total <- fouye[, c(1, 15)]

# Year-over-year percentage change in total enrollment. Rows are ordered by
# year first so lag() refers to the previous year; the earliest year has no
# predecessor, hence its growth rate is NA.
fouye_total_growth <- fouye_plots_total |>
  arrange(YEAR) |>
  mutate(
    growth_rate = (TOTAL.ENROLLED / lag(TOTAL.ENROLLED) - 1) * 100
  )

head(fouye_total_growth, 3)

##   YEAR TOTAL.ENROLLED growth_rate
## 1 2011            423          NA
## 2 2012           1144   170.44917
## 3 2013           1480    29.37063

Future prediction with lower and upper limits(linear models with visualization)

# Simple linear trend: total enrollment as a function of year.
fouye_model <- lm(TOTAL.ENROLLED ~ YEAR, data = fouye_plots_total)

# Future years (2024 through 2035) to predict for.
new_fouye_years <- data.frame(YEAR = seq(2024, 2035, by = 1))

# Point predictions from the fitted line.
predictions <- predict(fouye_model, newdata = new_fouye_years)

# Pair each future year with its predicted enrollment.
model_results <- data.frame(
  YEAR = new_fouye_years$YEAR,
  TOTAL.ENROLLED = predictions
)
head(model_results, 5)

##   YEAR TOTAL.ENROLLED
## 1 2024       41428.14
## 2 2025       45075.27
## 3 2026       48722.39
## 4 2027       52369.51
## 5 2028       56016.63

The growth rate plot

# Factor that rescales growth rates (%) onto the enrollment axis so both
# series can share one panel. It is computed once so the plotted line, the
# points and the secondary axis cannot drift out of sync.
growth_to_enrollment <- max(fouye_total_growth$TOTAL.ENROLLED) /
  max(fouye_total_growth$growth_rate, na.rm = TRUE)

growth_rater <- ggplot(fouye_total_growth, aes(x = YEAR)) +
  # Bars: total enrollment per year.
  geom_col(aes(y = TOTAL.ENROLLED), fill = "steelblue") +
  # Line and points: growth rate, rescaled to the enrollment axis.
  geom_line(
    aes(y = growth_rate * growth_to_enrollment),
    color = "red", size = 1, group = 1
  ) +
  geom_point(
    aes(y = growth_rate * growth_to_enrollment),
    color = "darkred", size = 3
  ) +
  scale_y_continuous(
    name = "TOTAL.ENROLLED",
    # Secondary axis: undo the rescaling so it reads in percent.
    sec.axis = sec_axis(~ . / growth_to_enrollment, name = "Growth Rate (%)")
  ) +
  theme_minimal() +
  theme(
    axis.title.y.left = element_text(color = "steelblue"),
    axis.title.y.right = element_text(color = "red")
  ) +
  labs(title = "Enrollment Over Time With Growth Rate", x = "Year") +
  scale_x_continuous(limits = c(2010, 2025))

growth_rater

Cognifyz Technologies Internship program

Data manipulation; stringr, dplyr, tidyr
- Selecting columns needed for specific tasks

# Load the restaurant data set used throughout this section.
cognify <- read.csv("C:\\Users\\hp\\Desktop\\RRR\\cognify.csv")

# Columns 4 and 18 are the city and aggregate-rating columns.
city_rate <- cognify[, c(4, 18)]
head(city_rate, 3)

##               City Aggregate.rating
## 1      Makati City              4.8
## 2      Makati City              4.5
## 3 Mandaluyong City              4.4

For the task on the average rating for each city, only two columns were needed: the city and aggregate rating columns.

Comparison based on a Yes/No column

# Online-delivery flag and rating columns.
online_del <- cognify[, c(14, 18)]
# Total number of restaurants in the data set.
total_restau <- nrow(cognify)
# Percentage of restaurants offering online delivery: each "Yes" row
# contributes 100 / total_restau to the sum.
offers_online <- online_del$Has.Online.delivery == "Yes"
percent_online <- sum((offers_online / total_restau) * 100)
percent_online

## [1] 25.66223

Analysis of cuisine names and determining combinations

# Average rating and number of restaurants for every distinct cuisine
# combination, best-rated combinations first.
agg_cognify <- cognify |>
  group_by(Cuisines) |>
  summarise(
    avg_rating = mean(Aggregate.rating, na.rm = TRUE),
    count = n()
  ) |>
  arrange(desc(avg_rating))

head(agg_cognify, 2)

## # A tibble: 2 × 3
##   Cuisines                avg_rating count
##   <chr>                        <dbl> <int>
## 1 American, BBQ, Sandwich        4.9     1
## 2 American, Burger, Grill        4.9     1

Analysis of restaurant names and determining chains

# Restaurant name plus two further columns (18 and 21) kept alongside it.
chains_res <- cognify[, c(2, 18, 21)]
# Number of rows per restaurant name, most frequent first; names that occur
# many times indicate chains.
chainss <- chains_res |>
  group_by(Restaurant.Name) |>
  summarise(no = n()) |>
  arrange(desc(no))
head(chainss, 2)

## # A tibble: 2 × 2
##   Restaurant.Name    no
##   <chr>           <int>
## 1 Cafe Coffee Day    83
## 2 Domino's Pizza     79

Review text analysis

# Restaurant name, aggregate rating and rating text.
rate_text <- cognify[, c(2, 18, 20)]
# Number of restaurants per rating-text category.
rate_count <- rate_text |>
  group_by(Rating.text) |>
  summarise(no = n())
rate_count

## # A tibble: 6 × 2
##   Rating.text    no
##   <chr>       <int>
## 1 Average      3737
## 2 Excellent     301
## 3 Good         2100
## 4 Not rated    2148
## 5 Poor          186
## 6 Very Good    1079

# Character count of each rating text, stored as a new column.
rate_text$Review_length <- nchar(rate_text$Rating.text)

head(rate_text, 3)

##          Restaurant.Name Aggregate.rating Rating.text Review_length
## 1       Le Petit Souffle              4.8   Excellent             9
## 2       Izakaya Kikufuji              4.5   Excellent             9
## 3 Heat - Edsa Shangri-La              4.4   Very Good             9

Data Visualization; ggplot2, leaflet(maps)
- Histogram to show price distribution of restaurants

# Restaurant name and price-range columns. The index vector is already
# numeric, so no as.numeric() conversion is needed.
price_range <- cognify[, c(2, 17)]
# Sort from the highest price range down. The earlier group_by() was dropped:
# nothing used the grouping and it was never removed with ungroup().
price_range <- price_range |>
  arrange(desc(Price.range))
# binwidth = 1 draws one bar per price level instead of relying on the
# default of 30 bins (assumes Price.range takes a few whole-number levels;
# TODO confirm against the data).
pr <- ggplot(price_range, aes(x = Price.range)) +
  geom_histogram(binwidth = 1, fill = "darkorange") +
  theme_minimal()

pr

Interactive world map to show location of restaurants, with names and rating plus clustering

# Name, coordinate and rating columns needed for the maps.
cognify_map <- cognify[, c(2, 8, 9, 18)]
# Country outlines for the base layer.
world <- map_data("world")

map1 <- ggplot() +
  # Base layer: world polygons.
  geom_polygon(
    data = world, aes(x = long, y = lat, group = group),
    fill = "lightblue", color = "darkblue"
  ) +
  # One point per restaurant. The outline colour is a constant, so it is set
  # outside aes(); inside aes() the string "darkred" was treated as data and
  # produced a default colour plus a spurious legend. shape = 21 is a filled
  # circle, which makes the rating-based fill visible.
  geom_point(
    data = cognify_map,
    aes(x = Longitude, y = Latitude, fill = Aggregate.rating),
    shape = 21, colour = "darkred", size = 3
  ) +
  # Restaurant name printed just above each point.
  geom_text(
    data = cognify_map,
    aes(x = Longitude, y = Latitude, label = Restaurant.Name),
    vjust = -1, hjust = 0.5, size = 3, fontface = "bold"
  ) +
  coord_fixed(1.3) +
  labs(title = "Restaurant Locations", x = "Longitude", y = "Latitude") +
  theme_minimal()

map1

The ggplot2 map could not handle the overlapping (clustered) restaurant names on the map, which prompted a switch to the “leaflet” package

library(leaflet)
# Interactive map: one marker per restaurant on the default tile layer, with
# marker clustering enabled.
interact_map <- leaflet(cognify_map) |>
  addTiles() |>
  addMarkers(
    lng = ~Longitude,
    lat = ~Latitude,
    # Popup shown on click: restaurant name and rating.
    popup = ~paste(
      "<b>Restaurant:</b>", Restaurant.Name, "<br>",
      "<b>Rating:</b>", Aggregate.rating, "<br>"
    ),
    clusterOptions = markerClusterOptions()
  )

interact_map

Chisq.test
- Determining whether the price ranges are affected by different factors, e.g. online delivery

# Subset of the restaurant data for the independence test; only its
# Price.range and Has.Online.delivery columns are used below.
online_dev <- cognify[,c(2,13,14,17)]
# Pearson chi-squared test on the Price.range x Has.Online.delivery
# contingency table. The call is kept inline because the printed "data:"
# line is taken from this expression.
chisq.test(table(online_dev$Price.range, online_dev$Has.Online.delivery))

## 
##  Pearson's Chi-squared test
## 
## data:  table(online_dev$Price.range, online_dev$Has.Online.delivery)
## X-squared = 721.38, df = 3, p-value < 2.2e-16

Certificate earned

Forage; Tata job simulation

Data visualizations and details needed for the executives of a business/company

library(dplyr)
library(stringr)
library(ggplot2)
library(anytime)
library(lubridate)
library(scales)
library(tidyr)
# Load the online-retail transactions.
tata <- read.csv("C:\\Users\\hp\\Desktop\\RRR\\Online Retail.csv")

# Make sure both measures are numeric before filtering on them.
tata$Quantity <- as.numeric(tata$Quantity)
tata$UnitPrice <- as.numeric(tata$UnitPrice)
# Treat quantities below 1 and negative unit prices as missing.
tata$Quantity <- replace(tata$Quantity, which(tata$Quantity < 1), NA)
tata$UnitPrice <- replace(tata$UnitPrice, which(tata$UnitPrice < 0), NA)

# Transactions from every country except the United Kingdom, reduced to the
# three columns used below (Quantity, UnitPrice, Country).
no_uk <- filter(tata, Country != "United Kingdom")[, c(4, 6, 8)]

# Revenue per country (quantity times unit price, ignoring missing values);
# countries with zero revenue are dropped.
no_uk_summary <- no_uk |>
  group_by(Country) |>
  summarise(Revenue = sum(Quantity * UnitPrice, na.rm = TRUE)) |>
  filter(Revenue != 0)

# Total quantity sold per country, sorted by country name. Countries with a
# zero total or a blank name are dropped.
no_uk_quan <- no_uk[, c(1, 3)] |>
  group_by(Country) |>
  summarise(Quantities = sum(Quantity, na.rm = TRUE)) |>
  arrange(Country) |>
  filter(Quantities != 0) |>
  filter(Country != "")

# Combine quantity and revenue per country (only countries present in both
# tables survive the merge), highest revenue first.
no_uk_bar <- merge(no_uk_quan, no_uk_summary, by = "Country") |>
  arrange(desc(Revenue))

# Long format: one row per country and measure, as needed for a dodged bar
# chart.
no_uk_bar_long <- pivot_longer(
  no_uk_bar,
  cols = c(Quantities, Revenue),
  names_to = "variable",
  values_to = "value"
)

# Dodged bar chart comparing total quantity and revenue per country. The
# labels are corrected: the y axis carries both measures, so it is labelled
# "Value" rather than "Revenue", and the legend names the measure rather
# than "Quantity". The title now states what is shown.
ggplot(
  no_uk_bar_long,
  aes(x = reorder(Country, value), y = value, fill = variable)
) +
  geom_col(position = "dodge", width = 0.7) +
  theme_minimal() +
  labs(
    title = "Quantity and Revenue by Country (excluding United Kingdom)",
    x = "Country", y = "Value", fill = "Measure"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Ongoing course(s)

Deep Learning in R

Topic(s) to cover
- Linear regression as shallow neural network
- Logistic regression as a neural network
- Artificial Neural Network (ANN)
- Deep Neural Network
- Bias and Variance in deep learning, etc.
Expected outcome
- Basic understanding of Artificial intelligence with R

Future direction

Exploring how Artificial Intelligence relates to and can be used with data analysis, and applying both in industrial chemistry.