Basically, there are two ways to collect data from the web: via an API or via scraping.

1. Using an API

Example 1.1: www.reddit.com

First, we need to find the API links (i.e., the rules for constructing them). For reddit.com, the rule is very simple: append .json to the page URL. For example, https://www.reddit.com/ is the URL of the front page, so the corresponding API link is https://www.reddit.com/.json. You can visit that link in any web browser and you will see the data in JSON format.
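
The same rule applies to other pages, such as subreddits. As a minimal sketch (the subreddit name below is only an illustration), the API link is just the page URL with .json appended:

paste0("https://www.reddit.com/r/rstats/", ".json")
## [1] "https://www.reddit.com/r/rstats/.json"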

Second, we can request the API links and convert the JSON data into tables with R. There are several R packages for handling JSON; see, for example, "A biased comparison of JSON packages in R". Here we use rjson.

library(RCurl,quietly = T) # the package for HTTP requests
library(rjson) # to process JSON data
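
As an aside, jsonlite is another widely used JSON package; the sketch below shows the same request with it (assuming jsonlite is installed). The rest of this example sticks with rjson.

# jsonlite can fetch a URL and parse it in one step;
# simplifyVector = FALSE keeps the result as a nested list, like rjson does
jsondata2 = jsonlite::fromJSON("https://www.reddit.com/.json", simplifyVector = FALSE)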

Request the data:

page = getURL("https://www.reddit.com/.json")
class(page)
## [1] "character"

Convert the string to parsed JSON (an R list). To inspect the structure of the JSON, you can use JSON Editor Online.

jsondata = fromJSON(page,unexpected.escape="skip")
class(jsondata)
## [1] "list"
names(jsondata)
## [1] "kind" "data"
names(jsondata$data)
## [1] "modhash"  "dist"     "children" "after"    "before"

Format the data into a table with columns for author, domain, title, and URL. First, extract the authors:

items = jsondata$data$children
authors = sapply(items,function(x) x$data$author)
authors
##  [1] "Rockstar408"          "debbies_a_whore"      "Stryker412"          
##  [4] "echoman94"            "AlmightyOne23"        "codethetron"         
##  [7] "skennedy987"          "DiiMMer"              "maybemil"            
## [10] "Twentysix2"           "EngagingData"         "roadtrippa88"        
## [13] "St0pX"                "Square_Crab"          "DontCallMeMartha"    
## [16] "Cartoons4adults"      "MiltonsRedStapler"    "-iwl-"               
## [19] "Snatat"               "worldpins"            "mvea"                
## [22] "life_style_change"    "ButtWieghtThiersMoor" "mvea"                
## [25] "theylied2you"

Another way is to define an extractor function first.

extract_author = function(item){
  return(item$data$author)
}
extract_author(items[[1]])
## [1] "Rockstar408"

Apply the function to each element of items, either with a for loop or with sapply; the two approaches give the same result:

authors = c()
for (item in items){
  author = extract_author(item)
  authors = c(authors,author)
}
authors == sapply(items,extract_author)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
domains = sapply(items,function(x) x$data$domain)
titles = sapply(items,function(x) x$data$title)
urls = sapply(items,function(x) x$data$url)

Combine the columns into a data frame:

dp = data.frame(authors,domains,titles,urls,stringsAsFactors = F)
head(dp)
##           authors        domains
## 1     Rockstar408 streamable.com
## 2 debbies_a_whore self.AskReddit
## 3      Stryker412    youtube.com
## 4       echoman94      i.redd.it
## 5   AlmightyOne23      i.redd.it
## 6     codethetron      i.redd.it
##                                                    titles
## 1        [Highlight] Bjelica wins the game for the Kings!
## 2 What last minute gift for Christmas is always a winner?
## 3                         Ghostbusters: Afterlife trailer
## 4                        We Have Returned to a Golden Age
## 5                                             Release me!
## 6           Thought this belongs here... thank you friend
##                                                                                                  urls
## 1                                                                        https://streamable.com/ing5c
## 2 https://www.reddit.com/r/AskReddit/comments/e8d5ap/what_last_minute_gift_for_christmas_is_always_a/
## 3                                                         https://www.youtube.com/watch?v=ahZFCF--uRY
## 4                                                                 https://i.redd.it/d6qbp07eup341.jpg
## 5                                                                 https://i.redd.it/wkokma0jqp341.jpg
## 6                                                                 https://i.redd.it/c24m1epvlq341.jpg

Find the next-page URL (the after field identifies the last post on the current page, which reddit uses for pagination):

nextp = jsondata$data$after
nextp
## [1] "t3_e8efz4"
nextp = paste0("https://www.reddit.com/.json?count=25&after=",nextp)
nextp
## [1] "https://www.reddit.com/.json?count=25&after=t3_e8efz4"

Now we write a function that extracts the data and the next-page URL.

extract_page = function(api_link){
  page = getURL(api_link)
  jsondata = fromJSON(page,unexpected.escape="skip")
  items = jsondata$data$children
  authors = sapply(items,function(x) x$data$author)
  domains = sapply(items,function(x) x$data$domain)
  titles = sapply(items,function(x) x$data$title)
  urls = sapply(items,function(x) x$data$url)
  dp = data.frame(authors,domains,titles,urls,stringsAsFactors = F)
  nextp = paste0("https://www.reddit.com/.json?count=25&after=",jsondata$data$after)
  return(list(dp,nextp))
}

d = extract_page("https://www.reddit.com/.json")
head(d[[1]]) #data
##           authors        domains
## 1     Rockstar408 streamable.com
## 2 debbies_a_whore self.AskReddit
## 3      Stryker412    youtube.com
## 4       echoman94      i.redd.it
## 5   AlmightyOne23      i.redd.it
## 6     codethetron      i.redd.it
##                                                    titles
## 1        [Highlight] Bjelica wins the game for the Kings!
## 2 What last minute gift for Christmas is always a winner?
## 3                         Ghostbusters: Afterlife trailer
## 4                        We Have Returned to a Golden Age
## 5                                             Release me!
## 6           Thought this belongs here... thank you friend
##                                                                                                  urls
## 1                                                                        https://streamable.com/ing5c
## 2 https://www.reddit.com/r/AskReddit/comments/e8d5ap/what_last_minute_gift_for_christmas_is_always_a/
## 3                                                         https://www.youtube.com/watch?v=ahZFCF--uRY
## 4                                                                 https://i.redd.it/d6qbp07eup341.jpg
## 5                                                                 https://i.redd.it/wkokma0jqp341.jpg
## 6                                                                 https://i.redd.it/c24m1epvlq341.jpg
d[[2]] #next
## [1] "https://www.reddit.com/.json?count=25&after=t3_e8efz4"

Put them together:

library(dplyr,quietly = T)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
url = "https://www.reddit.com/.json"
data = data.frame()
n=0
while (!is.null(url)&n<=10){
  d = extract_page(url)
  data = bind_rows(data,d[[1]])
  url = d[[2]]
  n=n+1
  print(n)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
dim(data)
## [1] 275   4
head(data)
##           authors        domains
## 1     Rockstar408 streamable.com
## 2 debbies_a_whore self.AskReddit
## 3      Stryker412    youtube.com
## 4       echoman94      i.redd.it
## 5   AlmightyOne23      i.redd.it
## 6     codethetron      i.redd.it
##                                                    titles
## 1        [Highlight] Bjelica wins the game for the Kings!
## 2 What last minute gift for Christmas is always a winner?
## 3                         Ghostbusters: Afterlife trailer
## 4                        We Have Returned to a Golden Age
## 5                                             Release me!
## 6           Thought this belongs here... thank you friend
##                                                                                                  urls
## 1                                                                        https://streamable.com/ing5c
## 2 https://www.reddit.com/r/AskReddit/comments/e8d5ap/what_last_minute_gift_for_christmas_is_always_a/
## 3                                                         https://www.youtube.com/watch?v=ahZFCF--uRY
## 4                                                                 https://i.redd.it/d6qbp07eup341.jpg
## 5                                                                 https://i.redd.it/wkokma0jqp341.jpg
## 6                                                                 https://i.redd.it/c24m1epvlq341.jpg
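
Note that reddit rate-limits anonymous clients, so in practice it is polite (and more reliable) to send a descriptive User-Agent header and to pause between requests. A minimal sketch, where the header value is only an illustration:

page = getURL("https://www.reddit.com/.json",
              httpheader = c("User-Agent" = "r-data-collection-tutorial/0.1"))
Sys.sleep(1) # wait a second before requesting the next page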

2. Using Scraping

Example 2.1: HK Discuss (news.discuss.com.hk).

Sys.setlocale(locale="Chinese") # for Windows
## [1] "LC_COLLATE=Chinese (Simplified)_China.936;LC_CTYPE=Chinese (Simplified)_China.936;LC_MONETARY=Chinese (Simplified)_China.936;LC_NUMERIC=C;LC_TIME=Chinese (Simplified)_China.936"
#Sys.setlocale("LC_ALL", 'en_US.UTF-8') # for MAC
library(RCurl)
library(XML) # to parse html docs and use xpath
url="https://news.discuss.com.hk/forumdisplay.php?fid=54&threadMode=neutral"
source=getURL(url,.encoding='UTF-8')
#source=iconv(source, "big5", "UTF-8",sub = 'byte') 
page =htmlParse(source,encoding = 'UTF-8')
class(page)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"

Use XPath to extract the authors, titles, URLs, etc.

authors = xpathSApply(page,"//tbody[contains(@id,'normalthread')]//td[@class='author']/cite/a",xmlValue)
authors
##  [1] "Rebirth7"       "今天綠色羔羊"   "gavinsfxs"      "井底委屈的世界"
##  [5] "stkwok1985"     "東方大頭妹"     "高達原始的水管" "煙hai"         
##  [9] "石子宏"         "井底進擊的牛牛" "香港鶴雞輸甩褲" "fake06"        
## [13] "袋中野生的菠菜" "laohang"        "pugbb"          "健brad70208"   
## [17] "殘廁花花手袋"   "UNBIASEDA"

Now we need to remove the surrounding white space (trim the strings):

library(stringr)
authors = str_trim(authors)
authors
##  [1] "Rebirth7"       "今天綠色羔羊"   "gavinsfxs"      "井底委屈的世界"
##  [5] "stkwok1985"     "東方大頭妹"     "高達原始的水管" "煙hai"         
##  [9] "石子宏"         "井底進擊的牛牛" "香港鶴雞輸甩褲" "fake06"        
## [13] "袋中野生的菠菜" "laohang"        "pugbb"          "健brad70208"   
## [17] "殘廁花花手袋"   "UNBIASEDA"

To extract titles and urls:

titles = xpathSApply(page,"//tbody[contains(@id,'normalthread')]//span[@class='tsubject']/a",xmlValue)
titles
##  [1] "  【抗暴26周】終院高院門外遭縱火 目擊者質疑有人扮示威者擲汽油彈"                                                                          
##  [2] "  暴徒火燒法院 縱暴派拒譴責 - 香港文匯報"                                                                                                 
##  [3] "  【逃犯條例】警員罵遊行人士江永祥稱聽到斥示威者也粗口辱警"                                                                               
##  [4] "  <U+FEFF>暴亂傷城系列之二店危居險 | 黑暴瘋狂燒店 險殺樓上居民"                                                                           
##  [5] "  【逃犯條例】因政見遭無理解僱民間自研「搵工平台」助求職配對"                                                                             
##  [6] "  香港華仁書院發現土製炸彈警稱可致大量死傷 - RTHK"                                                                                        
##  [7] "  【12.8遊行】高院終院遭縱火律政司:絕不容忍破壞司法機構行為"                                                                             
##  [8] "  中國最新磁浮列車曝光無人駕駛時速600公里杭州去上海20分鐘"                                                                                
##  [9] "  旺季變寒冬 零售業料裁5600人"                                                                                                            
## [10] "  【Life goes on】黑暴打殘香港 旅遊業大佬葉慶寧:89年開業到現在,香港經歷金融海嘯、沙士等難關、沒有認輸,只要社會回復安寧、香港必重現光輝"
## [11] "  報復美方制裁傳北京下令3年淘汰外國電腦設備"                                                                                              
## [12] "  【法國大罷工】高登兩記者中突圍彈受傷     聲明要求馬克龍譴責警察"                                                                        
## [13] "  港華書院外藏兩飛釘炸彈"                                                                                                                 
## [14] "  方仲賢斥台灣民進黨:用港人鮮血換選票"                                                                                                   
## [15] "  法援所限要換律師  被告嘆612不支援  基金:無法獲心儀代表 非安全網「穿窿」 - 20191210 - 要聞"                                             
## [16] "  【諸行無常】聽聽少年一席話"                                                                                                             
## [17] "  【香港人權與民主法案】歐盟啟動歐版《馬格尼茨基法案》 立法程序 - 香港經濟日報 - 即時新聞頻道 - 國際形勢 - 環球經濟金融"                  
## [18] "  【黑暴「孤狼」】法院遭暴徒縱火 退休法官:反應暴徒行事魯莽"
urls = xpathSApply(page,"//tbody[contains(@id,'normalthread')]//span[@class='tsubject']/a/@href")
urls
##                                                                                href 
## "viewthread.php?tid=28745079&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744831&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28745421&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28745061&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28743844&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28745173&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28745153&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744250&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744911&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744883&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28743282&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744229&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744855&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28745637&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744762&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744900&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28746151&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512" 
##                                                                                href 
## "viewthread.php?tid=28744545&extra=page%3D1&tr_h=12514647595def510d9c44a9_16606512"

Put them together:

library(stringr,quietly = T)
url="https://news.discuss.com.hk/forumdisplay.php?fid=54&threadMode=neutral"

scrape = function(url){
  source=getURL(url,.encoding='utf-8')
  #source=iconv(source, "big5", "UTF-8",sub = 'byte') 
  page =htmlParse(source,encoding = 'UTF-8')
  authors = str_trim(xpathSApply(page,"//tbody[contains(@id,'normalthread')]//td[@class='author']/cite",xmlValue))
  titles = xpathSApply(page,"//tbody[contains(@id,'normalthread')]//span[@class='tsubject']/a",xmlValue)
  urls = xpathSApply(page,"//tbody[contains(@id,'normalthread')]//span[@class='tsubject']/a/@href")
  df = data.frame(authors,titles,urls,stringsAsFactors = F)
  
  nexp = xpathSApply(page,"//a[@class='next']/@href")[1]
  nexp = paste0("https://news.discuss.com.hk/",nexp)
  return(list(df,nexp))
}

k=scrape(url)
k[[1]]
##           authors
## 1        Rebirth7
## 2    今天綠色羔羊
## 3       gavinsfxs
## 4  井底委屈的世界
## 5      stkwok1985
## 6      東方大頭妹
## 7  高達原始的水管
## 8           煙hai
## 9          石子宏
## 10 井底進擊的牛牛
## 11 香港鶴雞輸甩褲
## 12         fake06
## 13 袋中野生的菠菜
## 14        laohang
## 15          pugbb
## 16    健brad70208
## 17   殘廁花花手袋
## 18      UNBIASEDA
##                                                                                                                                       titles
## 1                                                                              【抗暴26周】終院高院門外遭縱火 目擊者質疑有人扮示威者擲汽油彈
## 2                                                                                                     暴徒火燒法院 縱暴派拒譴責 - 香港文匯報
## 3                                                                                   【逃犯條例】警員罵遊行人士江永祥稱聽到斥示威者也粗口辱警
## 4                                                                               <U+FEFF>暴亂傷城系列之二店危居險 | 黑暴瘋狂燒店 險殺樓上居民
## 5                                                                                 【逃犯條例】因政見遭無理解僱民間自研「搵工平台」助求職配對
## 6                                                                                            香港華仁書院發現土製炸彈警稱可致大量死傷 - RTHK
## 7                                                                                 【12.8遊行】高院終院遭縱火律政司:絕不容忍破壞司法機構行為
## 8                                                                                    中國最新磁浮列車曝光無人駕駛時速600公里杭州去上海20分鐘
## 9                                                                                                                旺季變寒冬 零售業料裁5600人
## 10   【Life goes on】黑暴打殘香港 旅遊業大佬葉慶寧:89年開業到現在,香港經歷金融海嘯、沙士等難關、沒有認輸,只要社會回復安寧、香港必重現光輝
## 11                                                                                                 報復美方制裁傳北京下令3年淘汰外國電腦設備
## 12                                                                           【法國大罷工】高登兩記者中突圍彈受傷     聲明要求馬克龍譴責警察
## 13                                                                                                                    港華書院外藏兩飛釘炸彈
## 14                                                                                                      方仲賢斥台灣民進黨:用港人鮮血換選票
## 15                                                法援所限要換律師  被告嘆612不支援  基金:無法獲心儀代表 非安全網「穿窿」 - 20191210 - 要聞
## 16                                                                                                                【諸行無常】聽聽少年一席話
## 17                     【香港人權與民主法案】歐盟啟動歐版《馬格尼茨基法案》 立法程序 - 香港經濟日報 - 即時新聞頻道 - 國際形勢 - 環球經濟金融
## 18                                                                                 【黑暴「孤狼」】法院遭暴徒縱火 退休法官:反應暴徒行事魯莽
##                                                                                 urls
## 1  viewthread.php?tid=28745079&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 2  viewthread.php?tid=28744831&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 3  viewthread.php?tid=28745421&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 4  viewthread.php?tid=28745061&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 5  viewthread.php?tid=28743844&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 6  viewthread.php?tid=28745173&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 7  viewthread.php?tid=28745153&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 8  viewthread.php?tid=28744250&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 9  viewthread.php?tid=28744911&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 10 viewthread.php?tid=28744883&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 11 viewthread.php?tid=28743282&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 12 viewthread.php?tid=28744229&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 13 viewthread.php?tid=28744855&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 14 viewthread.php?tid=28745637&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 15 viewthread.php?tid=28744762&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 16 viewthread.php?tid=28744900&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 17 viewthread.php?tid=28746151&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
## 18 viewthread.php?tid=28744545&extra=page%3D1&tr_h=21387056885def510e13bd39_66144590
head(k[[2]])
## [1] "https://news.discuss.com.hk/forumdisplay.php?fid=54&threadMode=neutral&page=2"

Loop to get more pages:

frdata=data.frame()
n=0
while (!is.null(url)&n<=10){
  k = scrape(url)
  frdata = bind_rows(frdata,k[[1]])
  url = k[[2]]
  n=n+1
  print(n)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
dim(frdata)
## [1] 268   3
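
Finally, you will usually want to save the collected data for later analysis, e.g. as a CSV file (the file name is just an example; fileEncoding = "UTF-8" keeps the Chinese text intact):

write.csv(frdata, "discuss_news.csv", row.names = FALSE, fileEncoding = "UTF-8")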

3. Using an Automated Browser