Mahbubul Majumder, PhD
Nov 11, 2014
Database is an organised collection of data
Database management system (DBMS)
MySQL
, ORACLE
, MS SQL
, DB2
, MS ACCESS
, Informix
RDBMS stands for Relational Database Management System
SQL is used to work with DBMS
SELECT
, UPDATE
, DELETE
, INSERT
CREATE
, ALTER
, DROP
FROM
, WHERE
, GROUP BY
, ORDER BY
SELECT <column names seperated by comma>
FROM <database table>
WHERE <condition>
GROUP BY <column name>
ORDER BY <column name>
SELECT
commandsmysql -u training -p
show
. For example this code will display all the available databases for user training. show databases;
use
. For example we select the training trainingDB
as followsuse trainingDB;
show tables;
+----------------------+
| Tables_in_trainingdb |
+----------------------+
| hubstation |
| hubway |
| titanic |
+----------------------+
3 rows in set (0.00 sec)
describe titanic;
describe titanic;
+-----------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-----------+--------------+------+-----+---------+-------+
| pclass | char(3) | YES | | NULL | |
| survived | char(1) | YES | | NULL | |
| name | varchar(34) | YES | | NULL | |
| sex | char(6) | YES | | NULL | |
| age | decimal(6,4) | YES | | NULL | |
| sibsp | int(11) | YES | | NULL | |
| parch | int(11) | YES | | NULL | |
| ticket | varchar(20) | YES | | NULL | |
| fare | decimal(7,4) | YES | | NULL | |
| cabin | varchar(16) | YES | | NULL | |
| embarked | varchar(12) | YES | | NULL | |
| boat | char(2) | YES | | NULL | |
| body | int(11) | YES | | NULL | |
| home_dest | varchar(32) | YES | | NULL | |
+-----------+--------------+------+-----+---------+-------+
select name, pclass, age, sex from titanic limit 10;
+---------------------------------+--------+---------+--------+
| name | pclass | age | sex |
+---------------------------------+--------+---------+--------+
| Allen, Miss. Elisabeth Walton | 1st | 29.0000 | female |
| Allison, Master. Hudson Trevor | 1st | 0.9167 | male |
| Allison, Miss. Helen Loraine | 1st | 2.0000 | female |
| Allison, Mr. Hudson Joshua Crei | 1st | 30.0000 | male |
| Allison, Mrs. Hudson J C (Bessi | 1st | 25.0000 | female |
| Anderson, Mr. Harry | 1st | 48.0000 | male |
| Andrews, Miss. Kornelia Theodos | 1st | 63.0000 | female |
| Andrews, Mr. Thomas Jr | 1st | 39.0000 | male |
| Appleton, Mrs. Edward Dale (Cha | 1st | 53.0000 | female |
| Artagaveytia, Mr. Ramon | 1st | 71.0000 | male |
+---------------------------------+--------+---------+--------+
10 rows in set (0.00 sec)
select pclass, survived, avg(age) as avg_age from titanic
group by pclass, survived;
+--------+----------+-------------+
| pclass | survived | avg_age |
+--------+----------+-------------+
| 1st | 0 | 43.24757282 |
| 1st | 1 | 36.83379392 |
| 2nd | 0 | 33.16780822 |
| 2nd | 1 | 24.85869565 |
| 3rd | 0 | 25.97454946 |
| 3rd | 1 | 21.54516565 |
+--------+----------+-------------+
select pclass, survived, avg(age) as avg_age from titanic
where survived=1
group by pclass, survived;
+--------+----------+-------------+
| pclass | survived | avg_age |
+--------+----------+-------------+
| 1st | 1 | 36.83379392 |
| 2nd | 1 | 24.85869565 |
| 3rd | 1 | 21.54516565 |
+--------+----------+-------------+
select name, pclass, survived, age from titanic
where name regexp '^.sen.+';
+---------------------+--------+----------+---------+
| name | pclass | survived | age |
+---------------------+--------+----------+---------+
| Osen, Mr. Olaf Elon | 3rd | 0 | 16.0000 |
+---------------------+--------+----------+---------+
select name, pclass, survived, age from titanic
where name regexp 'Jakob.*$';
+----------------------------+--------+----------+---------+
| name | pclass | survived | age |
+----------------------------+--------+----------+---------+
| Birnbaum, Mr. Jakob | 1st | 0 | 25.0000 |
| Johanson, Mr. Jakob Alfred | 3rd | 0 | 34.0000 |
| Pasic, Mr. Jakob | 3rd | 0 | 21.0000 |
| Wiklund, Mr. Jakob Alfred | 3rd | 0 | 18.0000 |
+----------------------------+--------+----------+---------+
select name, pclass, survived, age from titanic
where name regexp 'Ivar.*$';
+------------------------------+--------+----------+---------+
| name | pclass | survived | age |
+------------------------------+--------+----------+---------+
| Givard, Mr. Hans Kristensen | 2nd | 0 | 30.0000 |
| Berglund, Mr. Karl Ivar Sven | 3rd | 0 | 22.0000 |
+------------------------------+--------+----------+---------+
,.*
' makes sure that the expression matches only the first name. select name, pclass, survived, age from titanic
where name regexp ',.*Ivar.*$';
+------------------------------+--------+----------+---------+
| name | pclass | survived | age |
+------------------------------+--------+----------+---------+
| Berglund, Mr. Karl Ivar Sven | 3rd | 0 | 22.0000 |
+------------------------------+--------+----------+---------+
install.packages("RMySQL")
library(RMySQL)
library(RMySQL)
con = dbConnect(MySQL(),user="training", password="training123",
dbname="trainingDB", host="localhost")
myQuery <- "select pclass, survived, avg(age) from titanic
where survived=1 group by pclass;"
dbGetQuery(con, myQuery)
pclass survived avg(age)
1 1st 1 36.83379
2 2nd 1 24.85870
3 3rd 1 21.54517
myQuery <- "select pclass, avg(fare) as avg_fare from titanic
group by pclass;"
myData <- dbGetQuery(con, myQuery)
library(ggplot2)
ggplot(myData, aes(pclass, avg_fare)) + geom_bar(stat="identity")
R
package dplyr
considers database tables as data framelibrary(dplyr)
conDplyr = src_mysql(dbname = "trainingDB", user = "training",
password = "training123", host = "localhost")
myData <- conDplyr %>%
tbl("titanic") %>%
select(pclass, sex, age, fare, survived, parch) %>%
filter(survived==0) %>%
collect()
head(myData)
Source: local data frame [6 x 6]
pclass sex age fare survived parch
1 1st female 2 151.5500 0 2
2 1st male 30 151.5500 0 2
3 1st female 25 151.5500 0 2
4 1st male 39 0.0000 0 0
5 1st male 71 49.5042 0 0
6 1st male 47 227.5250 0 0
myData <- conDplyr %>%
tbl("titanic") %>%
select(pclass, sex, survived) %>%
group_by(pclass, sex) %>%
summarise(survival_ratio = mean(survived)) %>%
collect()
ggplot(myData, aes(pclass,survival_ratio, color=sex, group=sex)) +
geom_point(size=3) + geom_line()
dplyr
delays the actual operation until necessaryR
from the database only when we need itmyDF <- tbl(conDplyr,"titanic")
myDF1 <- filter(myDF, fare > 150)
myDF2 <- select(myDF1, pclass,sex,age,fare)
myDF3 <- group_by(myDF2, pclass,sex)
myDF4 <- summarise(myDF3,
avg_age = mean(age),
avg_fare = mean(fare))
R
head(myDF4, n=2)
pclass sex avg_age avg_fare
1 1st female 35.15152 232.9025
2 1st male 37.58854 248.7482
myDF4$query
<Query> SELECT `pclass`, `sex`, `avg_age`, `avg_fare`
FROM (SELECT `pclass`, `sex`, AVG(`age`) AS `avg_age`, AVG(`fare`) AS `avg_fare`
FROM `titanic`
WHERE `fare` > 150.0
GROUP BY `pclass`, `sex`) AS `_W2`
<MySQLConnection:(13596,1)>
myDF2$query
<Query> SELECT `pclass` AS `pclass`, `sex` AS `sex`, `age` AS `age`, `fare` AS `fare`
FROM `titanic`
WHERE `fare` > 150.0
<MySQLConnection:(13596,1)>
collect(myDF4)
Source: local data frame [2 x 4]
Groups: pclass
pclass sex avg_age avg_fare
1 1st female 35.15152 232.9025
2 1st male 37.58854 248.7482
head(collect(myDF2))
Source: local data frame [6 x 4]
pclass sex age fare
1 1st female 29.0000 211.3375
2 1st male 0.9167 151.5500
3 1st female 2.0000 151.5500
4 1st male 30.0000 151.5500
5 1st female 25.0000 151.5500
6 1st male 47.0000 227.5250
conDplyr %>%
tbl("titanic") %>%
filter(fare > 150) %>%
select(pclass,sex,age,fare) %>%
group_by(pclass,sex) %>%
summarise(avg_age = mean(age),
avg_fare = mean(fare))
Source: mysql 5.6.20 [training@localhost:/trainingDB]
From: <derived table> [?? x 4]
Grouped by: pclass
pclass sex avg_age avg_fare
1 1st female 35.15152 232.9025
2 1st male 37.58854 248.7482
.. ... ... ... ...
While package RMySQL provides connectivity to MySQL database, you can use package RODBC
to connect various databases.
ODBC
stands for Open Database Connectivity
RODBC
does the same for R
install.packages("RODBC")
library(ODBC)
dplyr
can work under ODBC
connection.MySQL 5.7 Reference Manual
http://dev.mysql.com/doc/refman/5.7/en/index.html
For details about RMySQL
functionality review this document. For example, this package provides functions to upload data from directly dataframe.
http://cran.r-project.org/web/packages/RMySQL/RMySQL.pdf
dplyr
vignettes that describe database issues
http://cran.r-project.org/web/packages/dplyr/vignettes/databases.html