Mahbubul Majumder, PhD
Nov 11, 2014
Database is an organised collection of data
Database management system (DBMS)
MySQL, ORACLE, MS SQL, DB2, MS ACCESS, InformixRDBMS stands for Relational Database Management System
SQL is used to work with DBMS
SELECT, UPDATE, DELETE, INSERT CREATE, ALTER, DROPFROM, WHERE, GROUP BY, ORDER BYSELECT <column names seperated by comma>
FROM <database table>
WHERE <condition>
GROUP BY <column name>
ORDER BY <column name>
SELECT commandsmysql -u training -p
show. For example this code will display all the available databases for user training. show databases;
use. For example we select the training trainingDB as followsuse trainingDB;
show tables;
+----------------------+
| Tables_in_trainingdb |
+----------------------+
| hubstation |
| hubway |
| titanic |
+----------------------+
3 rows in set (0.00 sec)
describe titanic;
describe titanic;
+-----------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-----------+--------------+------+-----+---------+-------+
| pclass | char(3) | YES | | NULL | |
| survived | char(1) | YES | | NULL | |
| name | varchar(34) | YES | | NULL | |
| sex | char(6) | YES | | NULL | |
| age | decimal(6,4) | YES | | NULL | |
| sibsp | int(11) | YES | | NULL | |
| parch | int(11) | YES | | NULL | |
| ticket | varchar(20) | YES | | NULL | |
| fare | decimal(7,4) | YES | | NULL | |
| cabin | varchar(16) | YES | | NULL | |
| embarked | varchar(12) | YES | | NULL | |
| boat | char(2) | YES | | NULL | |
| body | int(11) | YES | | NULL | |
| home_dest | varchar(32) | YES | | NULL | |
+-----------+--------------+------+-----+---------+-------+
select name, pclass, age, sex from titanic limit 10;
+---------------------------------+--------+---------+--------+
| name | pclass | age | sex |
+---------------------------------+--------+---------+--------+
| Allen, Miss. Elisabeth Walton | 1st | 29.0000 | female |
| Allison, Master. Hudson Trevor | 1st | 0.9167 | male |
| Allison, Miss. Helen Loraine | 1st | 2.0000 | female |
| Allison, Mr. Hudson Joshua Crei | 1st | 30.0000 | male |
| Allison, Mrs. Hudson J C (Bessi | 1st | 25.0000 | female |
| Anderson, Mr. Harry | 1st | 48.0000 | male |
| Andrews, Miss. Kornelia Theodos | 1st | 63.0000 | female |
| Andrews, Mr. Thomas Jr | 1st | 39.0000 | male |
| Appleton, Mrs. Edward Dale (Cha | 1st | 53.0000 | female |
| Artagaveytia, Mr. Ramon | 1st | 71.0000 | male |
+---------------------------------+--------+---------+--------+
10 rows in set (0.00 sec)
select pclass, survived, avg(age) as avg_age from titanic
group by pclass, survived;
+--------+----------+-------------+
| pclass | survived | avg_age |
+--------+----------+-------------+
| 1st | 0 | 43.24757282 |
| 1st | 1 | 36.83379392 |
| 2nd | 0 | 33.16780822 |
| 2nd | 1 | 24.85869565 |
| 3rd | 0 | 25.97454946 |
| 3rd | 1 | 21.54516565 |
+--------+----------+-------------+
select pclass, survived, avg(age) as avg_age from titanic
where survived=1
group by pclass, survived;
+--------+----------+-------------+
| pclass | survived | avg_age |
+--------+----------+-------------+
| 1st | 1 | 36.83379392 |
| 2nd | 1 | 24.85869565 |
| 3rd | 1 | 21.54516565 |
+--------+----------+-------------+
select name, pclass, survived, age from titanic
where name regexp '^.sen.+';
+---------------------+--------+----------+---------+
| name | pclass | survived | age |
+---------------------+--------+----------+---------+
| Osen, Mr. Olaf Elon | 3rd | 0 | 16.0000 |
+---------------------+--------+----------+---------+
select name, pclass, survived, age from titanic
where name regexp 'Jakob.*$';
+----------------------------+--------+----------+---------+
| name | pclass | survived | age |
+----------------------------+--------+----------+---------+
| Birnbaum, Mr. Jakob | 1st | 0 | 25.0000 |
| Johanson, Mr. Jakob Alfred | 3rd | 0 | 34.0000 |
| Pasic, Mr. Jakob | 3rd | 0 | 21.0000 |
| Wiklund, Mr. Jakob Alfred | 3rd | 0 | 18.0000 |
+----------------------------+--------+----------+---------+
select name, pclass, survived, age from titanic
where name regexp 'Ivar.*$';
+------------------------------+--------+----------+---------+
| name | pclass | survived | age |
+------------------------------+--------+----------+---------+
| Givard, Mr. Hans Kristensen | 2nd | 0 | 30.0000 |
| Berglund, Mr. Karl Ivar Sven | 3rd | 0 | 22.0000 |
+------------------------------+--------+----------+---------+
,.*' makes sure that the expression matches only the first name. select name, pclass, survived, age from titanic
where name regexp ',.*Ivar.*$';
+------------------------------+--------+----------+---------+
| name | pclass | survived | age |
+------------------------------+--------+----------+---------+
| Berglund, Mr. Karl Ivar Sven | 3rd | 0 | 22.0000 |
+------------------------------+--------+----------+---------+
install.packages("RMySQL")
library(RMySQL)
library(RMySQL)
con = dbConnect(MySQL(),user="training", password="training123",
dbname="trainingDB", host="localhost")
myQuery <- "select pclass, survived, avg(age) from titanic
where survived=1 group by pclass;"
dbGetQuery(con, myQuery)
pclass survived avg(age)
1 1st 1 36.83379
2 2nd 1 24.85870
3 3rd 1 21.54517
myQuery <- "select pclass, avg(fare) as avg_fare from titanic
group by pclass;"
myData <- dbGetQuery(con, myQuery)
library(ggplot2)
ggplot(myData, aes(pclass, avg_fare)) + geom_bar(stat="identity")
R package dplyr considers database tables as data framelibrary(dplyr)
conDplyr = src_mysql(dbname = "trainingDB", user = "training",
password = "training123", host = "localhost")
myData <- conDplyr %>%
tbl("titanic") %>%
select(pclass, sex, age, fare, survived, parch) %>%
filter(survived==0) %>%
collect()
head(myData)
Source: local data frame [6 x 6]
pclass sex age fare survived parch
1 1st female 2 151.5500 0 2
2 1st male 30 151.5500 0 2
3 1st female 25 151.5500 0 2
4 1st male 39 0.0000 0 0
5 1st male 71 49.5042 0 0
6 1st male 47 227.5250 0 0
myData <- conDplyr %>%
tbl("titanic") %>%
select(pclass, sex, survived) %>%
group_by(pclass, sex) %>%
summarise(survival_ratio = mean(survived)) %>%
collect()
ggplot(myData, aes(pclass,survival_ratio, color=sex, group=sex)) +
geom_point(size=3) + geom_line()
dplyr delays the actual operation until necessaryR from the database only when we need itmyDF <- tbl(conDplyr,"titanic")
myDF1 <- filter(myDF, fare > 150)
myDF2 <- select(myDF1, pclass,sex,age,fare)
myDF3 <- group_by(myDF2, pclass,sex)
myDF4 <- summarise(myDF3,
avg_age = mean(age),
avg_fare = mean(fare))
Rhead(myDF4, n=2)
pclass sex avg_age avg_fare
1 1st female 35.15152 232.9025
2 1st male 37.58854 248.7482
myDF4$query
<Query> SELECT `pclass`, `sex`, `avg_age`, `avg_fare`
FROM (SELECT `pclass`, `sex`, AVG(`age`) AS `avg_age`, AVG(`fare`) AS `avg_fare`
FROM `titanic`
WHERE `fare` > 150.0
GROUP BY `pclass`, `sex`) AS `_W2`
<MySQLConnection:(13596,1)>
myDF2$query
<Query> SELECT `pclass` AS `pclass`, `sex` AS `sex`, `age` AS `age`, `fare` AS `fare`
FROM `titanic`
WHERE `fare` > 150.0
<MySQLConnection:(13596,1)>
collect(myDF4)
Source: local data frame [2 x 4]
Groups: pclass
pclass sex avg_age avg_fare
1 1st female 35.15152 232.9025
2 1st male 37.58854 248.7482
head(collect(myDF2))
Source: local data frame [6 x 4]
pclass sex age fare
1 1st female 29.0000 211.3375
2 1st male 0.9167 151.5500
3 1st female 2.0000 151.5500
4 1st male 30.0000 151.5500
5 1st female 25.0000 151.5500
6 1st male 47.0000 227.5250
conDplyr %>%
tbl("titanic") %>%
filter(fare > 150) %>%
select(pclass,sex,age,fare) %>%
group_by(pclass,sex) %>%
summarise(avg_age = mean(age),
avg_fare = mean(fare))
Source: mysql 5.6.20 [training@localhost:/trainingDB]
From: <derived table> [?? x 4]
Grouped by: pclass
pclass sex avg_age avg_fare
1 1st female 35.15152 232.9025
2 1st male 37.58854 248.7482
.. ... ... ... ...
While package RMySQL provides connectivity to MySQL database, you can use package RODBC to connect various databases.
ODBC stands for Open Database Connectivity
RODBC does the same for Rinstall.packages("RODBC")
library(ODBC)
dplyr can work under ODBC connection.MySQL 5.7 Reference Manual
http://dev.mysql.com/doc/refman/5.7/en/index.html
For details about RMySQL functionality review this document. For example, this package provides functions to upload data from directly dataframe.
http://cran.r-project.org/web/packages/RMySQL/RMySQL.pdf
dplyr vignettes that describe database issues
http://cran.r-project.org/web/packages/dplyr/vignettes/databases.html