Section 46 Merge Data: using sqldf


sqldf is an R package for running SQL statements on R data frames, optimized for convenience. sqldf works with the SQLite, H2, PostgreSQL or MySQL databases. The library supports many SQL arguments like WHERE, GROUP BY, HAVING, UPDATE, SET, SHOW, SELECT, ORDER BY etc. See the help file for further details.

Note that the current version does not support RIGHT and FULL OUTER JOIN.


46.1 Function sqldf

sqldf(x, stringsAsFactors = FALSE,
   row.names = FALSE, envir = parent.frame(), 
   method = getOption("sqldf.method"),
   file.format = list(), dbname, drv = getOption("sqldf.driver"), 
   user, password = "", host = "localhost", port,
   dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"),
   verbose = isTRUE(getOption("sqldf.verbose")))


46.2 Data

library(sqldf)

set.seed(123)

# DF1
ID <- paste0('S',1:5)
Sex <- sample(x = c('M','F'), size = length(ID), replace = TRUE)
Area <- c('A1','A1','A2','A3', NA)
DF1 <- data.frame(ID=ID, Sex=Sex, Area=Area, stringsAsFactors=FALSE)

# DF2
Area <- c('A1','A2','A3','A4')
Epi <- c('Da','Db','Dc','D4')
DF2 <- data.frame(Area=Area, Epi=Epi, stringsAsFactors=FALSE)

46.2.1 DF1

46.2.2 DF2

46.3 Inner Join

# Inner Join

sSQL <- 'SELECT * FROM DF1 INNER JOIN DF2 ON DF1.Area = DF2.Area'

# sSQL <- 'SELECT * FROM DF1 INNER JOIN DF2 USING (Area, Area)'

m1DF <- sqldf(sSQL)


46.4 Left Join

# Left Join

sSQL <- 'SELECT * FROM DF1 LEFT JOIN DF2 ON DF1.Area = DF2.Area'

m2DF <- sqldf(sSQL)