Mahbubul Majumder, PhD
Oct 23, 2014
Each web site is a source of data
Three ways to obtain data from the web: download the data files a site provides, request data through an API, or scrape the web pages with R and look for useful information
Web data could be structured, semi-structured, or unstructured
Sources of data through API
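Many APIs return JSON. Below is a minimal sketch of pulling a JSON response into R with the rjson package; the endpoint URL is hypothetical, for illustration only.
# install.packages("rjson")
library(rjson)
# hypothetical endpoint; substitute the API URL you actually need
apiUrl <- "http://api.example.com/stats.json"
# read and parse the JSON document into nested R lists
dat <- fromJSON(file = apiUrl)
str(dat)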
Often we see data in HTML tables
How would you get data from an HTML table?
Fortunately, we can do it automatically using the R package XML
# install.packages("XML")
library(XML)
First, inspect the data by opening the HTML link in a browser and viewing the page with its developer tools
As you hover the mouse over an element, it is highlighted on the page
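You can also inspect the page from R itself by parsing the HTML and probing it with XPath. A small sketch (the URL matches the demonstration below; the XPath query is illustrative):
library(XML)
# parse the page into a DOM tree
doc <- htmlParse("http://www.nfl.com/stats/categorystats?tabSeq=0")
# list every <table> node and report its id attribute
tbls <- getNodeSet(doc, "//table")
sapply(tbls, xmlGetAttr, "id")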
Demonstration: getting passing yards data
# the query string is long, so build it in pieces;
# the d-447263-p parameter selects the page number
u0 <- "http://www.nfl.com/stats/categorystats?tabSeq=0&"
u1 <- "season=2012&seasonType=PRE&experience=&Submit=Go&"
u2 <- "archive=true&conference=null&statisticCategory=PASSING&"
url1 <- paste(u0, u1, u2, "d-447263-p=1&qualified=false", sep="")
url2 <- paste(u0, u1, u2, "d-447263-p=2&qualified=false", sep="")
url3 <- paste(u0, u1, u2, "d-447263-p=3&qualified=false", sep="")
# Scan the table
tables1 <- readHTMLTable(url1)
tables2 <- readHTMLTable(url2)
tables3 <- readHTMLTable(url3)
# in the HTML source, the table with id="result" contains the desired data
bd1 <- tables1$result
bd2 <- tables2$result
bd3 <- tables3$result
# combine the multiple pages of data together
nflData <- rbind(bd1,bd2,bd3)
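The three page-by-page calls above can be collapsed into a loop, which scales better if more pages appear; a sketch assuming the same URL pattern:
# build one URL per page, scrape each, and stack the results
urls <- paste0(u0, u1, u2, "d-447263-p=", 1:3, "&qualified=false")
pages <- lapply(urls, function(u) readHTMLTable(u)$result)
nflData <- do.call(rbind, pages)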
names(nflData)
[1] "Rk" "Player" "Team" "Pos" "\nComp" "\nAtt" "\nPct"
[8] "\nAtt/G" "\nYds" "\nAvg" "\nYds/G" "\nTD" "\nInt" "\n1st"
[15] "\n1st%" "\nLng" "\n20+" "\n40+" "\nSck" "\nRate"
names(nflData) <- gsub("\n","",names(nflData))
is.factor(nflData$Rate)
[1] TRUE
nflData$Rate <- as.numeric(as.character(nflData$Rate))
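Why as.character first? Calling as.numeric directly on a factor returns the internal level codes rather than the printed values:
f <- factor(c("10.5", "2.3"))
as.numeric(f)                 # 1 2  -- level codes, not the numbers
as.numeric(as.character(f))   # 10.5 2.3 -- the actual values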
# get top 15 players based on Rating
datPlot <- nflData[order(nflData$Rate, decreasing = TRUE)[1:15], ]
library(ggplot2)
ggplot(datPlot, aes(reorder(Player, Rate), Rate)) +
  geom_point() + coord_flip() + theme_bw() +
  ylab("Rating") + xlab("Player")
Twitter provides an API to obtain tweet information
Use the R package twitteR to extract and work with Twitter data
Install twitteR from GitHub, not from CRAN as you usually would
Twitter data sometimes provides interesting information about current incidents
The challenge is handling the variety of languages, grammar, and spelling
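There is no universal fix, but simple normalization before analysis helps; a minimal cleaning sketch in base R (the function name and cleaning rules are illustrative):
cleanText <- function(x) {
  x <- tolower(x)                        # case-fold
  x <- gsub("http[^[:space:]]*", "", x)  # drop URLs
  x <- gsub("[^a-z#@' ]", " ", x)        # keep letters, hashtags, mentions
  gsub("[[:space:]]+", " ", x)           # collapse repeated whitespace
}
cleanText("RT @KirkDBorne: Get the R Survival Guides! http://t.co/s1nfEdr7rx")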
Setting up twitteR in R (for details see https://github.com/geoffjentry/twitteR/)
# install.packages(c("devtools", "rjson", "bit64", "httr"))
library(devtools)
#install_github("twitteR", username="geoffjentry")
library(twitteR)
api_key <- "your api key"
api_secret <- "your api secret"
access_token <- "your access token"
access_token_secret <- "your access token secret"
setup_twitter_oauth(api_key,api_secret,access_token,
access_token_secret)
[1] "Using direct authentication"
# search recent tweets matching the query (25 by default);
# twListToDF flattens the returned list of status objects into a data frame
dsTweet <- searchTwitter("data science")
dfTweet <- twListToDF(dsTweet)
tweetText <- dfTweet$text
head(tweetText)
[1] "RT @KirkDBorne: Get the 1-page R Survival Guides for #DataScientists: http://t.co/s1nfEdr7rx #abdsc #BigData #Analytics #Rstats http://t.co…"
[2] "RT @Steve_Burkett: #Football Scouting actually is a science not rocket science but does require advanced data analysis techniques to be suc…"
[3] "RT @Steve_Burkett: #Football Scouting actually is a science not rocket science but does require advanced data analysis techniques to be suc…"
[4] "RT @KirkDBorne: Get the 1-page R Survival Guides for #DataScientists: http://t.co/s1nfEdr7rx #abdsc #BigData #Analytics #Rstats http://t.co…"
[5] "Reminder for Today's Event http://t.co/rc0BYMMFkd"
[6] "RT @Steve_Burkett: #Football Scouting actually is a science not rocket science but does require advanced data analysis techniques to be suc…"
# pull up to 500 tweets from a single user's timeline
user <- getUser("drsanjaygupta")
sanjay <- userTimeline(user, n=500)
snDf <- twListToDF(sanjay)
snText <- snDf$text
library(stringr)
# split each tweet into its individual words
snWords <- str_extract_all(snText, "\\w+")
head(snWords)
[[1]]
[1] "MsNikki214" "I" "really" "appreciate" "that"
[6] "Thank" "you"
[[2]]
[1] "NY" "amp" "NJ" "quarantine" "rule"
[6] "will" "cover" "anyone" "not" "just"
[11] "health" "care" "workers" "known" "to"
[16] "have" "had" "contact" "with" "an"
[21] "Ebola" "patient"
[[3]]
[1] "NY" "amp" "NJ" "health" "departments"
[6] "order" "21" "day" "quarantine" "for"
[11] "all" "returning" "health" "care" "workers"
[16] "who" "ve" "had" "direct" "contact"
[21] "with" "Ebola" "patients"
[[4]]
[1] "here" "s" "why" "you" "don"
[6] "t" "need" "to" "worry" "about"
[11] "ebola" "even" "though" "there" "s"
[16] "now" "another" "case" "in" "the"
[21] "US" "http" "t" "co" "bUx9I7T3Ta"
[[5]]
[1] "Here" "is" "a" "good" "time"
[6] "line" "for" "Dr" "Craig" "Spencer"
[11] "Ebola" "ebol" "http" "t" "co"
[16] "E5Sks6ylWb"
[[6]]
[1] "Admin" "official" "just" "told" "me"
[6] "considering" "mandatory" "quarantine" "for" "healthcare"
[11] "workers" "returning" "from" "W" "Africa"
[16] "We" "understand" "public" "fear"
# extract hashtags and count how often each appears
snHashtags <- str_extract_all(snDf$text, "#\\w+")
freq <- table(unlist(snHashtags))
library(wordcloud)
wordcloud(names(freq), freq, random.order=FALSE, colors="#1B9E77")
title("\n\nWhat Dr. Sanjay is talking about",
cex.main=1.5, col.main="gray50")
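If the cloud gets too busy, wordcloud can suppress rare terms; a variant of the call above using its min.freq and max.words arguments:
wordcloud(names(freq), freq, min.freq = 2, max.words = 50,
          random.order = FALSE, colors = "#1B9E77")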
# fetch 300 #rstats tweets and extract each tweet's source client
r_tweets <- searchTwitter("#rstats", n=300)
sources <- sapply(r_tweets, function(x) x$getStatusSource())
# the source is an HTML anchor tag; strip the markup to keep the client name
sources <- gsub("</a>", "", sources)
sources <- strsplit(sources, ">")
sources <- sapply(sources, function(x) ifelse(length(x) > 1, x[2], x[1]))
source_table <- table(sources)
df <- as.data.frame(source_table)   # columns: sources, Freq
ggplot(df[df$Freq > 8, ], aes(reorder(sources, Freq), Freq)) +
  geom_bar(stat = "identity") + coord_flip()
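To compare clients on a common scale, the counts can be turned into percentages; a one-line sketch:
# share of #rstats tweets per client, in percent
round(100 * sort(source_table, decreasing = TRUE) / sum(source_table), 1)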
For details about setting up the twitteR package in R, see https://github.com/geoffjentry/twitteR/
For examples of twitteR functions and what this package can deliver, visit http://geoffjentry.hexdump.org/twitteR.pdf