Email : halake.kumar22@gmail.com

——————————————————————————————————————

——————————————————————————————————————

Creating the Connection

driver<- Oracle()
#driver <- dbDriver("Oracle")

 host <- "localhost"
 port <- 1521
 service <- "Orcl1"
 drv <- dbDriver("Oracle")

 connect.string <- paste(

 "(DESCRIPTION=",

 "(ADDRESS=(PROTOCOL=tcp)(HOST=", host, ")(PORT=", port, "))",

 "(CONNECT_DATA=(SERVICE_NAME=", service, ")))", sep = "")

 connection <- dbConnect(drv, username = "system", password = "Oracle_1", dbname = connect.string)
 
 
 #connection <- dbConnect(drv = driver, username = "system", password = "Oracle_1")

——————————————————————————————————————

Storing and removing some data in database

# Store first 10 rows of iris in Oracle database as an 'iris' table
dbWriteTable(connection, 'IRIS', iris[sample(nrow(iris),10,replace = T),], overwrite = TRUE, row.names = FALSE,col.names = FALSE)
[1] TRUE
# IMP : The Iris data has some columns of type double. When one see the same data in SQL Developer(What I used), the data totally looks different (eg 3.0 in r is 2.9999999 in SQL) So, manipulating may throw error sometimes in R

# If some queries are sending to same table from SQL developer and ROracle connection, sometimes we get error in R, so better to use one at time
 SELECT DISTINCT "Species" FROM IRIS
Species
versicolor
virginica
setosa
 SELECT * FROM IRIS
Displaying records 1 - 10
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
6.7 2.5 5.8 1.8 virginica
4.9 2.5 4.5 1.7 virginica
4.4 3.0 1.3 0.2 setosa
6.0 2.9 4.5 1.5 versicolor
5.4 3.9 1.7 0.4 setosa
5.1 3.5 1.4 0.3 setosa
6.7 2.5 5.8 1.8 virginica
4.9 3.0 1.4 0.2 setosa
6.7 2.5 5.8 1.8 virginica
5.6 2.7 4.2 1.3 versicolor
dbExistsTable(connection,'IRIS')  # Check whether iris table exists? -> TRUE / FALSE
[1] TRUE
dbReadTable(connection, "IRIS") %>% head(n = 10) # Read first 5 rows of iris table in database
   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1           6.7         2.5          5.8         1.8  virginica
2           4.9         2.5          4.5         1.7  virginica
3           4.4         3.0          1.3         0.2     setosa
4           6.0         2.9          4.5         1.5 versicolor
5           5.4         3.9          1.7         0.4     setosa
6           5.1         3.5          1.4         0.3     setosa
7           6.7         2.5          5.8         1.8  virginica
8           4.9         3.0          1.4         0.2     setosa
9           6.7         2.5          5.8         1.8  virginica
10          5.6         2.7          4.2         1.3 versicolor
dbRemoveTable(connection,'IRIS')  # Remove the table iris,runs only when no query is running like rs
[1] TRUE
dbExistsTable(connection,'IRIS')  # Whether iris exists yet?
[1] FALSE

——————————————————————————————————————

Importing the ODI data frame from Oracle to R.

Sometimes for any type of explorations (eg:Graphical), one may need to read the entire table from database. It can be achieved by dbReadTable. The table imported is in Dataframe format. The Field types are accordingly adjusted.

In oracle database the ODI dataset already exist.

ODI = dbReadTable(connection, "ODI")
glimpse(ODI)
Observations: 55,926
Variables: 7
$ COUNTRY   <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghan...
$ PLAYER    <chr> "Mohammad Shahzad", "Mohammad Shahzad", "Mohammad Sh...
$ RUNS      <int> 118, 110, 100, 82, 57, 55, 37, 34, 20, 16, 11, 8, 2,...
$ SCORERATE <int> 98, 99, 139, 76, 100, 87, 80, 77, 91, 57, 100, 80, 5...
$ MATCHDATE <dttm> 2010-02-16, 2009-09-01, 2010-08-16, 2010-07-10, 201...
$ GROUND    <chr> "Sharjah CA Stadium", "VRA Ground", "Cambusdoon New ...
$ VERSUS    <chr> "Canada", "Netherlands", "Scotland", "Netherlands", ...
head(ODI,n=3)
      COUNTRY           PLAYER RUNS SCORERATE  MATCHDATE
1 Afghanistan Mohammad Shahzad  118        98 2010-02-16
2 Afghanistan Mohammad Shahzad  110        99 2009-09-01
3 Afghanistan Mohammad Shahzad  100       139 2010-08-16
                 GROUND      VERSUS
1    Sharjah CA Stadium      Canada
2            VRA Ground Netherlands
3 Cambusdoon New Ground    Scotland
object.size(ODI)
2821912 bytes
class(ODI)
[1] "data.frame"
SELECT TO_CHAR(MIN(MATCHDATE),'DDth-Month-YYYY') Begin,TO_CHAR(Max(MATCHDATE),'DDth-Month-YYYY') "End" FROM ODI
1 records
BEGIN End
05TH-January -1971 28TH-October -2011

——————————————————————————————————————

Some Commands from ROracle package

names(dbGetInfo(connection))
 [1] "username"      "dbname"        "serverVersion" "serverType"   
 [5] "resTotal"      "resOpen"       "prefetch"      "bulk_read"    
 [9] "bulk_write"    "stmt_cache"    "results"      
names(dbGetInfo(driver))
[1] "driverName"      "driverVersion"   "clientVersion"   "conTotal"       
[5] "conOpen"         "interruptible"   "unicode_as_utf8" "ora_attributes" 
[9] "connections"    
dbDataType(dbObj = connection,obj = 1:10) # (Dtatype of R object interms of SQL object)
[1] "INT"
dbDataType(dbObj = connection,obj = iris)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
    "DOUBLE"     "DOUBLE"     "DOUBLE"     "DOUBLE"       "TEXT" 
dbDataType(dbObj = connection,obj = "what is it?")
[1] "TEXT"
#dbDataType(dbObj = connection,obj = odi)
dbDataType(ANSI(), 1:5)
[1] "INT"
dbDataType(ANSI(), TRUE)
[1] "SMALLINT"
dbDataType(ANSI(), Sys.Date())
[1] "DATE"
dbDataType(ANSI(), list(raw(10), raw(20)))
[1] "BLOB"
dbDataType(ANSI(), I(3))
[1] "DOUBLE"
dbExistsTable(connection,"ODI") # Chech whether table exixts in 
[1] TRUE
dbExistsTable(connection,"Some_datatable")
[1] FALSE
dbListTables(connection) %>% tail(20) # Checking the list of Tables avaiable in Oracle DB.
 [1] "LOGSTDBY$APPLY_MILESTONE"  "LOGSTDBY$SCN"             
 [3] "LOGSTDBY$FLASHBACK_SCN"    "LOGSTDBY$PLSQL"           
 [5] "LOGSTDBY$SKIP_TRANSACTION" "LOGSTDBY$SKIP"            
 [7] "LOGSTDBY$SKIP_SUPPORT"     "LOGSTDBY$HISTORY"         
 [9] "LOGSTDBY$EDS_TABLES"       "DEF$_AQCALL"              
[11] "DEF$_AQERROR"              "AQ$_DEF$_AQCALL_F"        
[13] "AQ$DEF$_AQCALL"            "AQ$_DEF$_AQERROR_F"       
[15] "AQ$DEF$_AQERROR"           "PRODUCT_PRIVS"            
[17] "SQLPLUS_PRODUCT_PROFILE"   "HELP"                     
[19] "ODI"                       "mtcars"                   
dbListFields(connection,'ODI') # Colnames
[1] "COUNTRY"   "PLAYER"    "RUNS"      "SCORERATE" "MATCHDATE" "GROUND"   
[7] "VERSUS"   
dbListResults(connection) # history
list()
dbGetException(connection) # last error in the connection
$errorNum
[1] 0

$errorMsg
[1] ""
# dbGetInfo(connection)  # Connection details
# dbGetInfo(driver)  #
# dbListConnections(drv = driver) # Get host,port,SID in driver
# dbExecute  # -  number of rows affected by statements
# dbSendQuery
# dbGetQuery =  dbSendQuery() + dbFetch() + dbClearResult

# dbGetRowCount #  Number of rows fetched so far

# dbRemoveTable()
# dbSendStatement()  #  is for DML : UPDATE, DELETE, INSERT INTO, DROP TABLE, ...
# dbGetRowsAffected # - This method returns the number of rows that were added, deleted, or updated by a data manipulation statement

# dbWriteTable #  read data in Oracle database
# dbReadTable
# dbRemoveTable
# fetch  # fetching certain number of instances
# dbFetch
# dbHasCompleted #  This method returns if the operation has completed. A SELECT query is completed if all rows have been fetched. A data manipulation statement is always completed.

# dbIsValid # Is this DBMS object still valid?

# dbBegin # savepoint
# dbBreak
# dbCommit
# dbRollback
# dbWithTransaction() 
# sqlAppendTable #  Insert rows into a tabl
# sqlCreateTable #  Create a simple table
# sqlData #  Convert a data frame into form suitable for upload to an SQL database
# transactions # Begin/commit/rollback SQL transactions
# dbDisconnect()  # Disconnect R from Oracle database

Sometimes data may be having larger in size,importing a large volume of data in R is not recommended. So just querrying by letting it in database itself is a better choice.

——————————————————————————————————————

SQL Queries

number of matches

SELECT COUNT(*) FROM ODI /* number of matches */
COUNT(*)
55926

Countrywise average runs

SELECT COUNTRY, AVG(RUNS) as avg_runs FROM ODI GROUP BY COUNTRY
Displaying records 1 - 10
COUNTRY AVG_RUNS
Ireland 20.50801
Kenya 17.73057
South Africa 25.68711
West Indies 22.63222
Zimbabwe 19.79050
East Africa 9.90625
United Arab Emirates 14.07018
New Zealand 21.06679
Pakistan 22.71946
Scotland 16.20961

Merging fields

SELECT COUNTRY||AVG(RUNS) as Country_avg_Runs FROM ODI GROUP BY COUNTRY
COUNTRY_AVG_RUNS
Ireland20.5080128205128205128205128205128205128
Kenya17.7305737109658678286129266521423384169
South Africa25.6871118012422360248447204968944099379
West Indies22.6322232223222322232223222322232223222
Zimbabwe19.7904961932265686531898135993699133631
East Africa9.90625
United Arab Emirates14.0701754385964912280701754385964912281
New Zealand21.0667893284268629254829806807727690892
Pakistan22.7194604425583510154592300697180963929
Scotland16.2096069868995633187772925764192139738
SELECT GROUND||'-'||cOUNT(GROUND) FROM ODI GROUP BY GROUND
GROUND||‘-’||COUNT(GROUND)
Cambusdoon New Ground-100
OUTsurance Oval-409
Brisbane Cricket Ground-1184
Marrara Cricket Ground-72
Mahinda Rajapaksha International-76
Telstra Dome-225
Westpac Stadium-318
Nehru Stadium (Guwahati)-244
Kingsmead-642
Edgbaston-787
SELECT GROUND||'-'||cOUNT(GROUND) AS "GROUNDWISE COUNT" FROM ODI GROUP BY GROUND ORDER BY COUNT(GROUND) DESC
GROUNDWISE COUNT
Sharjah CA Stadium-3581
Sydney Cricket Ground-2462
Melbourne Cricket Ground-2308
R. Premadasa Stadium-1797
Harare Sports Club-1784
Adelaide Oval-1262
W.A.C.A. Ground-1222
Brisbane Cricket Ground-1184
Gymkhana Club Ground-1123
Eden Park-1100
SELECT COUNTRY ||' vs '|| VERSUS ||' at '|| GROUND ||' - ' || COUNT(GROUND) AS "Country-Versus-Ground-n" FROM ODI GROUP BY COUNTRY,VERSUS,GROUND ORDER BY COUNT(GROUND) DESC
Country-Versus-Ground-n
Sri Lanka vs Pakistan at Sharjah CA Stadium - 294
Pakistan vs Sri Lanka at Sharjah CA Stadium - 287
West Indies vs Australia at Melbourne Cricket Ground - 245
West Indies vs Australia at Sydney Cricket Ground - 241
Sri Lanka vs India at R. Premadasa Stadium - 241
India vs Sri Lanka at R. Premadasa Stadium - 239
Australia vs West Indies at Melbourne Cricket Ground - 234
Australia vs West Indies at Sydney Cricket Ground - 231
India vs Pakistan at Sharjah CA Stadium - 221
Pakistan vs India at Sharjah CA Stadium - 204

——————————————————————————————————————

Maximum runs by India versus all countries it played and the corresponding player

SELECT COUNTRY,VERSUS,PLAYER,RUNS FROM ODI WHERE RUNS IN 
(SELECT MAX(RUNS) AS RUNS FROM ODI WHERE COUNTRY='India' GROUP BY VERSUS) AND COUNTRY = 'India' ORDER BY RUNS DESC
Displaying records 1 - 10
COUNTRY VERSUS PLAYER RUNS
India South Africa Sachin R Tendulkar 200
India New Zealand Sachin R Tendulkar 186
India Sri Lanka Sourav C Ganguly 183
India Sri Lanka Mahendra S Dhoni 183
India Australia Sachin R Tendulkar 175
India Zimbabwe Kapil Dev 175
India Bangladesh Virender Sehwag 175
India Namibia Sachin R Tendulkar 152
India Pakistan Mahendra S Dhoni 148
India Zimbabwe Sachin R Tendulkar 146
  • Is it correct output ? Check how many opponent countries are there for India. They are 20 not 132
SELECT COUNT(*) FROM (SELECT COUNTRY,VERSUS,PLAYER,RUNS FROM ODI WHERE RUNS IN 
(SELECT MAX(RUNS) AS RUNS FROM ODI WHERE COUNTRY='India' GROUP BY VERSUS) AND COUNTRY = 'India' ORDER BY RUNS DESC)
COUNT(*)
132
SELECT COUNT(DISTINCT VERSUS) FROM ODI WHERE COUNTRY = 'India'
COUNT(DISTINCTVERSUS)
20
SELECT o.COUNTRY, o.VERSUS,o.MATCHDATE,o.PLAYER,o.RUNS FROM ODI o JOIN 
(SELECT COUNTRY,VERSUS, MAX(RUNS) AS highest_runs FROM ODI WHERE COUNTRY = 'India' GROUP BY COUNTRY,VERSUS) d 
ON o.COUNTRY=d.COUNTRY AND o.VERSUS = d.VERSUS AND o.RUNS = d.highest_runs ORDER BY o.RUNS DESC
Displaying records 1 - 10
COUNTRY VERSUS MATCHDATE PLAYER RUNS
India South Africa 2010-02-24 Sachin R Tendulkar 200
India New Zealand 1999-11-08 Sachin R Tendulkar 186
India Sri Lanka 1999-05-26 Sourav C Ganguly 183
India Sri Lanka 2005-10-31 Mahendra S Dhoni 183
India Australia 2009-11-05 Sachin R Tendulkar 175
India Zimbabwe 1983-06-18 Kapil Dev 175
India Bangladesh 2011-02-19 Virender Sehwag 175
India Namibia 2003-02-23 Sachin R Tendulkar 152
India Pakistan 2005-04-05 Mahendra S Dhoni 148
India Kenya 2001-10-24 Sachin R Tendulkar 146
  • 21 rows because same highest runs in 1999 and 2005 by Sourav C Ganguly and Mahendra S Dhoni respectively against Sri Lanka

——————————————————————————————————————

Maximum runs of each player and the country he belongs to

SELECT COUNTRY,PLAYER,MAX(RUNS) AS RUNS FROM ODI GROUP BY COUNTRY,PLAYER
Displaying records 1 - 10
COUNTRY PLAYER RUNS
Afghanistan Mohammad Nabi 62
Afghanistan Karim Sadiq 114
Afghanistan Aftab Alam 6
Australia Brett Lee 57
Australia John Dyson 79
Australia Bradley J Hodge 123
Australia Adam C Voges 80
Australia Michael J Di Venuto 89
Australia Brendon P Julian 35
Australia Julien M Wiener 50

Country ever having more runs than max runs by India

SELECT COUNTRY,RUNS FROM ODI WHERE RUNS > (SELECT MAX(RUNS) AS RUNS FROM ODI WHERE COUNTRY = 'India' GROUP BY COUNTRY) /* No country */

Table: 0 records

COUNTRY RUNS ——– —–

  • The same result is achieved by JOIN also. Joins are recommended over nested Subqueries like above
SELECT o.COUNTRY,O.RUNS FROM ODI o JOIN 
(SELECT COUNTRY,MAX(RUNS) AS max_run FROM ODI WHERE COUNTRY = 'India' GROUP BY COUNTRY) b
ON o.COUNTRY = b.COUNTRY AND o.RUNS >b.max_run
/* No country, by putting >=, will get India only */

Table: 0 records

COUNTRY RUNS ——– —–

——————————————————————————————————————

Score rate variation for different Countries

SELECT COUNTRY, stddev(SCORERATE) sd FROM ODI GROUP BY COUNTRY ORDER BY sd 
Displaying records 1 - 10
COUNTRY SD
East Africa 18.37641
Hong Kong 29.88540
United States 30.54528
United Arab Emirates 34.18290
Bangladesh 40.40986
Kenya 40.81031
Zimbabwe 41.98671
Sri Lanka 42.08557
Namibia 42.53942
England 42.73910

——————————————————————————————————————

R QUERIES

Fetch and save in R

Here, the previous SQL query output is saved in R as dataframe, so one can use to to explore in different ways like graphs.

df <- dbGetQuery(connection,"SELECT DISTINCT TO_CHAR(MATCHDATE,'YYYY') FROM ODI")
class(df)       # It is dataframe
[1] "data.frame"
typeof(df)
[1] "list"
colnames(df)  # default column name
[1] "TO_CHAR(MATCHDATE,'YYYY')"
class(df$`TO_CHAR(MATCHDATE,'YYYY')`)  # It is character
[1] "character"
head(df)
  TO_CHAR(MATCHDATE,'YYYY')
1                      2005
2                      2003
3                      2000
4                      2001
5                      1997
6                      1987
object.size(df)
2992 bytes

——————————————————————————————————————

Monthwise number of matches

  dbGetQuery(connection,statement = "SELECT TO_CHAR(MATCHDATE,'mm') MONTH FROM ODI") %>%
  ggplot() +
  geom_bar(aes(x = as.numeric(MONTH)),stat = "count",fill = rainbow(12),alpha = 0.6) +
  theme(panel.grid = element_blank(),panel.background = element_blank(),
        axis.ticks = element_blank(),axis.text.x = element_text(angle = 90))+
  scale_x_discrete(limits = 1:12, labels = month.name) +
  labs(y = "Number of matches", x = "Month", title = "Matches in each month")+
  theme(plot.title = element_text(face = "bold",color = "gray83",hjust = 0.5,size = 12))

——————————————————————————————————————

Number of matches per Year and month

dbGetQuery(connection,"SELECT TO_CHAR(MATCHDATE,'YYYY') YEAR, TO_CHAR(MATCHDATE,'MM') MONTH FROM ODI") %>%
  ggplot() +
  geom_bar(aes(x = YEAR,fill = as.factor(MONTH)), stat = "count",color = "purple",alpha = 0.9,size = 0.04) +
  theme(axis.text.x.top = element_text(angle = 90),
        panel.background = element_rect(fill = "gray80"),
        plot.background = element_rect(fill = "gray80"),
        axis.ticks = element_blank(),
        legend.background = element_rect(fill = "gray83"),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_line(colour = "gray40",linetype = 3,lineend = 'butt')) +
  scale_y_continuous(expand = c(0.0,0),breaks = seq(0,3500,by = 250)) +
  scale_fill_brewer(name = 'Months',labels = month.name,palette = "Set3",direction = 1) +
  coord_flip()

  • Normally in any given year, most number of matches were played in first quarter
  • 1971 and 1972 are the beginning years where match happened only in a single monthyear
  • From the year 1991 we see some seasonal pattern interms of yerly number of matches. (the number of matches are relatively less in 1991, but in next consecutive 3 years it increased. Again number reduced in 1995 and tremendous increase in next 3 years)

——————————————————————————————————————

India’s highest runs against different opponents

dbGetQuery(connection,
"SELECT o.COUNTRY, o.VERSUS,o.MATCHDATE,o.PLAYER,o.RUNS FROM ODI o JOIN 
(SELECT COUNTRY,VERSUS, MAX(RUNS) AS highest_runs FROM ODI WHERE COUNTRY = 'India' GROUP BY COUNTRY,VERSUS) d
ON o.COUNTRY=d.COUNTRY AND o.VERSUS = d.VERSUS AND o.RUNS = d.highest_runs ORDER BY o.RUNS DESC") %>% 
  group_by(VERSUS) %>% summarise(RUNS = mean(RUNS)) %>% # Srilanka is twice,we need unique so
  ggplot() + geom_bar(aes(x = reorder(VERSUS,RUNS), y = RUNS),stat = "identity",fill = "springgreen") +
  labs(x="Opponent", y ="Maximum Runs",title = "Maximum Runs of India") + coord_flip() +
  theme_bw()

——————————————————————————————————————

Top number of centuries

SELECT PLAYER,COUNT(PLAYER) CENTURY FROM ODI WHERE RUNS >= 100 GROUP BY PLAYER ORDER BY CENTURY DESC FETCH NEXT 21 ROWS ONLY
Displaying records 1 - 10
PLAYER CENTURY
Sachin R Tendulkar 48
Ricky T Ponting 30
Sanath T Jayasuriya 28
Sourav C Ganguly 22
Herschelle H Gibbs 21
Saeed Anwar 20
Christopher H Gayle 19
Brian C Lara 19
Mark E Waugh 18
Desmond L Haynes 17
img <- readPNG("sachin-transpa-2.png") 
g <- rasterGrob(img, interpolate=TRUE) 

  dbGetQuery(connection,
             statement = "SELECT PLAYER,COUNT(PLAYER) CENTURY FROM ODI WHERE RUNS >= 100 GROUP BY PLAYER ORDER BY CENTURY DESC FETCH NEXT 21 ROWS ONLY") %>%
  ggplot(aes(x = reorder(PLAYER,desc(CENTURY)),y = CENTURY)) +
    annotation_custom(g, xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) + 
    geom_bar(stat = "identity",fill = rainbow(21),alpha = 0.6) +
    geom_text(aes(label = CENTURY,size =CENTURY),show.legend = FALSE,vjust = - 0.09,color = rainbow(21)) +
    theme(panel.grid = element_blank(),
          panel.background = element_rect(fill = "gray80"),
          plot.background = element_rect(fill = "gray80"),
          axis.ticks = element_blank(),axis.text.x = element_text(angle = 90,hjust = 1.0)) +
    scale_y_continuous(limits = c(0,52),labels = NULL) + 
    labs(y = "", x = "Batsman", title = "Top number of centuries") +
    theme(plot.title = element_text(face = "bold",color = "white",hjust = 0.5,size = 12))

——————————————————————————————————————

Some functions dbSendQuery,dbGetStatement,dbGetRowsAffected,fetch

q = "SELECT COUNTRY,VERSUS,PLAYER,RUNS FROM ODI WHERE COUNTRY = 'India'"

rs = dbSendQuery(connection,q)
rs
Statement:            SELECT COUNTRY,VERSUS,PLAYER,RUNS FROM ODI WHERE COUNTRY = 'India' 
Rows affected:        0 
Row count:            0 
Select statement:     TRUE 
Statement completed:  FALSE 
OCI prefetch:         FALSE 
Bulk read:            1000 
Bulk write:           1000 
dbGetStatement(rs)
[1] "SELECT COUNTRY,VERSUS,PLAYER,RUNS FROM ODI WHERE COUNTRY = 'India'"
dbGetRowsAffected(rs)
[1] 0
dbColumnInfo(rs)
     name    Sclass     type len precision scale nullOK
1 COUNTRY character VARCHAR2 100         0     0   TRUE
2  VERSUS character VARCHAR2 100         0     0   TRUE
3  PLAYER character VARCHAR2 100         0     0   TRUE
4    RUNS   integer   NUMBER  NA         3     0   TRUE
dbHasCompleted(rs)
[1] FALSE
fetched1 = fetch(rs,25) # Once first n instances are fetched, they wont be fetched again,In the next fetch, fetching start from next instance only. If no mention of n, all instances are fetched 

fetched2 = fetch(rs) # If dont give n, it will fetch remaining instances from DBIresult as a subset : fetch2

dbHasCompleted(rs)
[1] TRUE
fetched1 # dont forget, It is dataframe
   COUNTRY       VERSUS             PLAYER RUNS
1    India South Africa Sachin R Tendulkar  200
2    India  New Zealand Sachin R Tendulkar  186
3    India    Australia Sachin R Tendulkar  175
4    India  New Zealand Sachin R Tendulkar  163
5    India      Namibia Sachin R Tendulkar  152
6    India     Zimbabwe Sachin R Tendulkar  146
7    India        Kenya Sachin R Tendulkar  146
8    India    Australia Sachin R Tendulkar  143
9    India     Pakistan Sachin R Tendulkar  141
10   India  West Indies Sachin R Tendulkar  141
11   India    Australia Sachin R Tendulkar  141
12   India        Kenya Sachin R Tendulkar  140
13   India    Australia Sachin R Tendulkar  139
14   India    Sri Lanka Sachin R Tendulkar  138
15   India    Sri Lanka Sachin R Tendulkar  137
16   India    Australia Sachin R Tendulkar  134
17   India    Sri Lanka Sachin R Tendulkar  128
18   India        Kenya Sachin R Tendulkar  127
19   India     Zimbabwe Sachin R Tendulkar  127
20   India     Zimbabwe Sachin R Tendulkar  124
21   India     Pakistan Sachin R Tendulkar  123
22   India South Africa Sachin R Tendulkar  122
23   India  West Indies Sachin R Tendulkar  122
24   India    Sri Lanka Sachin R Tendulkar  120
25   India      England Sachin R Tendulkar  120
fetched = rbind(fetched1,fetched2) # At the begining itself would have used fetched = fetch(rs)

dbGetRowCount(rs)  # s the number of rows fetched so far.
[1] 6609
dbClearResult(rs)  # after setting dbClearResult(rs); print(rs)  produces an error, as it is cleared
[1] TRUE

Fetching is helpful when there is limitation on fetching instances from database

——————————————————————————————————————

Top 20 Indian player’s interms of average score rate

SELECT COUNTRY,PLAYER,AVG(SCORERATE) AS AVG_SCORE_RATE FROM ODI WHERE COUNTRY = 'India' GROUP BY COUNTRY,PLAYER  ORDER BY AVG_SCORE_RATE DESC FETCH NEXT 20 ROWS ONLY
/* FETCH FIRST 20 ROWS ONLY */
Displaying records 1 - 10
COUNTRY PLAYER AVG_SCORE_RATE
India Paras L Mhambrey 140.00000
India Joginder Sharma 133.00000
India Prashant S Vaidya 116.50000
India Subroto T Banerjee 102.20000
India Saradindu P Mukherjee 100.00000
India Pankaj Dharmani 100.00000
India Pankaj Singh 100.00000
India Umesh Yadav 100.00000
India Yusuf K Pathan 95.85366
India Virender Sehwag 93.91739
q = "SELECT COUNTRY,PLAYER,AVG(SCORERATE) AS AVG_SCORE_RATE FROM ODI WHERE COUNTRY = 'India' GROUP BY COUNTRY,PLAYER  ORDER BY AVG_SCORE_RATE DESC"
rs = dbSendQuery(connection,q)
fetch(rs,20) %>% ggplot() + 
  geom_bar(aes(x = reorder(PLAYER,AVG_SCORE_RATE),y = AVG_SCORE_RATE),
           stat="identity",fill = rainbow(20),width = .6,alpha = 0.6) +
  labs(x = "Top 20 Players",y = "Average Score rate",title ="") + coord_flip() + theme_light()

dbClearResult(rs)
[1] TRUE

——————————————————————————————————————

Top Players interms of average score rate for all countries

There are some players who played once or twice and their score rate would be either very high or very low. So we consider a player who played more than 5 ODI’s (This choice is subjective, so please no question - WHY 5?)

 SELECT COUNT(*) FROM (SELECT COUNTRY,PLAYER, AVG(SCORERATE) AVG_SCORE_RATE,COUNT(*) MATCHES FROM ODI GROUP BY COUNTRY,PLAYER)
COUNT(*)
1937
SELECT COUNT(*) FROM (SELECT COUNTRY,PLAYER, AVG(SCORERATE) AVG_SCORE_RATE,COUNT(*) MATCHES FROM ODI HAVING COUNT(*) > 5 GROUP BY COUNTRY,PLAYER)
COUNT(*)
1174
Almost 763 Players played 5 or less matches
q = "SELECT COUNTRY,PLAYER, AVG(SCORERATE) AVG_SCORE_RATE,COUNT(*) MATCHES FROM ODI HAVING COUNT(*) > 5 GROUP BY COUNTRY,PLAYER"
rs = dbSendQuery(connection,q)
fetched = fetch(rs) #dbFetch is also useful
country_list = split(x = fetched,f = fetched$COUNTRY)

top_20_each <- lapply(X = country_list, FUN = function(x) top_n(x,20))

g = list()
j = 1
while(j <= length(top_20_each)){
  for(i in names(top_20_each)){
    g[[j]] = ggplot(top_20_each[[i]],aes(x = reorder(PLAYER,AVG_SCORE_RATE),y = AVG_SCORE_RATE)) + 
      geom_bar(stat = "identity",width = 0.25,fill = "white",color = "steelblue1") +
      scale_y_continuous(breaks = seq(0,120,by = 40),limits = c(0,120),labels = seq(0,120,by = 40)) +
      labs(x = "",y = "",title = i) +
      theme_minimal() +
      theme(plot.title = element_text(size = 8),axis.title = element_text(size = 7)) +
      coord_flip()
    j = j+1
  }
}

do.call(gridExtra::grid.arrange,c(g,ncol = 3))

dbClearResult(rs)
[1] TRUE

——————————————————————————————————————

Correlation between Runs and scorerate for England Players

q = "SELECT PLAYER,RUNS,SCORERATE FROM ODI WHERE COUNTRY = 'England'"

dbGetQuery(connection,statement = q) %>%
  group_by(PLAYER) %>%
  summarise(correlation = cor(RUNS,SCORERATE)) %>%
  filter(abs(correlation) > 0.8) %>%
  ggplot(aes(x = reorder(PLAYER,correlation),y = correlation)) +
  geom_dotplot(binaxis='y', stackdir='center',
           stat = "identity",radius = 0.2, fill = rainbow(42)) +
  labs(x = "Player",title = "Correlation between Score rate and Runs",subtitle = "[For England Only]") +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.background = element_rect(fill = "gray90")) +
  coord_flip()

# dbGetQuery(connection,statement = q) %>% 
#   group_by(PLAYER) %>% 
#   summarise(correlation = cor(RUNS,SCORERATE)) %>% 
#   filter(abs(correlation) > 0.8) %>% 
#   ggplot(aes(x = reorder(PLAYER,correlation),y = correlation)) +
#   geom_point(size = 5, color = rainbow(42),pch =19) +
#   labs(x = "Player",title = "Correlation between Score rate and Runs") +
#   theme(panel.grid = element_blank(),plot.background = element_rect(fill = "gray90")) +
#   coord_flip()

Here we have noticed two outliers, Alex Jeremy Tudor and Geoffrey W Humpage, whose correlation is perfect -ve.

——————————————————————————————————————

q = "SELECT*FROM ODI WHERE COUNTRY = 'England' AND PLAYER IN ('Alex Jeremy Tudor','Geoffrey W Humpage')"
rs = dbSendQuery(connection,q)
outliers = dbFetch(res = rs, n = -1)
outliers= mutate(outliers,balls_faced = SCORERATE*RUNS/100)
outliers
  COUNTRY             PLAYER RUNS SCORERATE  MATCHDATE       GROUND
1 England Geoffrey W Humpage    6        43 1981-06-08   Headingley
2 England Geoffrey W Humpage    5        50 1981-06-06    Edgbaston
3 England  Alex Jeremy Tudor    6        67 2002-07-07 Old Trafford
4 England  Alex Jeremy Tudor    3       150 2002-07-09     The Oval
     VERSUS balls_faced
1 Australia        2.58
2 Australia        2.50
3 Sri Lanka        4.02
4     India        4.50
dbClearResult(rs)
[1] TRUE

This is because both the players played only in 1981 and 2002 and not even played for a complete over.

The rare players

Number of players played only once (In time period from 1971 to 2011)

q = "SELECT COUNTRY,PLAYER,COUNT(PLAYER) AS MATCHES  FROM ODI GROUP BY COUNTRY,PLAYER HAVING COUNT(PLAYER) = 1"
rs = dbSendQuery(connection,q)
rare = dbFetch(res = rs, n = -1)
rare %>% group_by(COUNTRY) %>% summarise(rare_players = sum(MATCHES)) %>% arrange(desc(rare_players))
# A tibble: 20 x 2
                COUNTRY rare_players
                  <chr>        <dbl>
 1             Pakistan           28
 2            Australia           24
 3                India           23
 4              England           21
 5          New Zealand           17
 6            Sri Lanka           17
 7          West Indies           17
 8           Bangladesh           14
 9               Canada           10
10 United Arab Emirates           10
11         South Africa            9
12             Zimbabwe            9
13          Afghanistan            7
14          Netherlands            5
15            Hong Kong            3
16        United States            3
17              Bermuda            2
18          East Africa            2
19              Ireland            2
20             Scotland            2
dbClearResult(rs)
[1] TRUE

Totally 225 players in ODI from 1971 to 2011 played once.

——————————————————————————————————————

Long time career in ODI

Note that this is different than the number of ODI’s played

q = 'SELECT PLAYER ,BEGIN,"END",MED,"END" - BEGIN AS DAYS,("END" - BEGIN)/365 AS YEAR FROM
(SELECT PLAYER,MIN(MATCHDATE) BEGIN,MAX(MATCHDATE) "END",MEDIAN(MATCHDATE) MED FROM ODI GROUP BY PLAYER) ORDER BY YEAR DESC FETCH NEXT 20 ROWS ONLY'
# aes(x = reorder(PLAYER,DIFF),y=DIFF)
rs = dbSendQuery(connection,q)
dbFetch(rs,n= -1) %>% ggplot() + 
  geom_errorbar(aes(x = reorder(PLAYER,YEAR),ymin=BEGIN,ymax = END),stat="identity",size = 7.5,color = rainbow(20))+
  geom_text(aes(x = reorder(PLAYER,YEAR),y=MED,label = paste(PLAYER,',',round(YEAR,2),'years'),size = 7)) +
  theme(axis.text.y = element_blank(),
        legend.position = 'none',
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 90),
        panel.background = element_rect(fill = 'gray70'),
        panel.grid.major.y = element_line(colour = "gray50",linetype = 3),
        panel.grid.major.x = element_line(colour = "gray50",linetype = 2),
        panel.grid.minor.x = element_line(colour = "gray50",linetype = 2),
        plot.background = element_rect(fill = 'gray70')) +
  labs(x = "",y = "Year") + 
  coord_flip()

dbClearResult(rs)
[1] TRUE

——————————————————————————————————————

Stephen B Smith (Outlier in above graph

Firstly let us see what are different percentiles(25th,50th,75th and 90th) of all years played by these players.

SELECT PLAYER ,BEGIN,"END",PERCENTILE_25,"MEDIAN",PERCENTILE_75,PERCENTILE_95,"END" - BEGIN AS DAYS,("END" - BEGIN)/365 AS YEARS FROM
(SELECT PLAYER,MIN(MATCHDATE) BEGIN,MAX(MATCHDATE) "END",
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY MATCHDATE ASC) PERCENTILE_25 ,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY MATCHDATE ASC) "MEDIAN",
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY MATCHDATE ASC) PERCENTILE_75 ,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY MATCHDATE ASC) PERCENTILE_95 FROM ODI GROUP BY PLAYER)
ORDER BY YEARS DESC FETCH NEXT 10 ROWS ONLY
Displaying records 1 - 10
PLAYER BEGIN END PERCENTILE_25 MEDIAN PERCENTILE_75 PERCENTILE_95 DAYS YEARS
Stephen B Smith 1983-02-06 2011-10-23 1984-01-21 00:00:00 1984-02-08 00:00:00 1985-01-23 00:00:00 1985-02-09 04:48:00 10486 28.72877
Michael J Smith 1973-09-05 1999-05-31 1974-07-26 18:00:00 1987-01-08 12:00:00 1999-05-23 00:00:00 1999-05-29 04:48:00 9399 25.75068
Sanath T Jayasuriya 1989-12-26 2011-06-28 1996-10-04 00:00:00 2001-02-03 00:00:00 2005-01-10 00:00:00 2009-01-14 19:11:59 7854 21.51781
Sachin R Tendulkar 1989-12-18 2011-04-02 1996-04-15 12:00:00 1999-11-15 12:00:00 2004-12-13 00:00:00 2009-10-23 13:11:59 7775 21.30137
Javed Miandad 1975-06-11 1996-03-09 1984-02-19 12:00:00 1987-03-30 00:00:00 1990-11-07 06:00:00 1993-03-05 03:35:59 7577 20.75890
Pinnaduwage A de Silva 1984-03-31 2003-03-18 1990-12-04 00:00:00 1995-04-03 12:00:00 1998-04-07 12:00:00 2002-12-01 18:00:00 6926 18.97534
Graham A Gooch 1976-08-26 1995-01-10 1981-07-20 12:00:00 1987-11-21 00:00:00 1991-04-29 00:00:00 1993-05-20 21:35:59 6711 18.38630
Wasim Akram 1985-02-16 2003-03-01 1991-08-03 00:00:00 1995-02-24 00:00:00 1999-05-20 18:00:00 2002-08-17 01:12:00 6587 18.04658
Grant W Flower 1992-10-25 2010-10-17 1997-10-01 18:00:00 1999-12-11 12:00:00 2001-10-05 06:00:00 2003-09-12 18:00:00 6566 17.98904
Muttiah Muralitharan 1993-09-02 2011-03-18 1998-08-15 00:00:00 2001-11-04 00:00:00 2006-02-08 12:00:00 2009-02-07 16:47:59 6406 17.55068

We see that almost 95% of matches played by Stephen B Smith are on or before 1985-02-04.

q = "SELECT TO_CHAR(MATCHDATE,'YYYY') STEPHEN_YEARS FROM ODI WHERE COUNTRY = 'Australia' AND PLAYER = 'Stephen B Smith'"
dbGetQuery(connection,statement = q) %>% table
.
1983 1984 1985 2011 
   6   12    6    1 

I wonder whether Stephen B Smith played in ODI after 25 years (in 2011 since 1985)? OR another player with same name entered in Australian team. This dataset doesen’t have evidence for that.

Finally disconnecting from database

dbDisconnect(connection)
[1] TRUE

——————————————————————————————————————

REFERENCE : https://cran.r-project.org/web/packages/ROracle/ROracle.pdf

****************************************************************************************************