Knowledge Mining: Text mining

File: textdata_mining02.R

Theme: Download Twitter data for network and sentiment analyses

Load packages

library(rtweet)
## Warning: package 'rtweet' was built under R version 4.0.4
library(igraph)
## Warning: package 'igraph' was built under R version 4.0.4
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::as_data_frame() masks tibble::as_data_frame(), igraph::as_data_frame()
## x purrr::compose()       masks igraph::compose()
## x tidyr::crossing()      masks igraph::crossing()
## x dplyr::filter()        masks stats::filter()
## x purrr::flatten()       masks rtweet::flatten()
## x dplyr::groups()        masks igraph::groups()
## x dplyr::lag()           masks stats::lag()
## x purrr::simplify()      masks igraph::simplify()
library(ggraph)
## Warning: package 'ggraph' was built under R version 4.0.4
library(data.table)
## Warning: package 'data.table' was built under R version 4.0.4
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose

Acquire API keys and tokens from the Twitter developer website

# Check https://datageneration.org/adp/twitter/ for detail

# Create a token for direct authentication to access Twitter data.
# The keys and tokens come from the Twitter developer account
# (Developer Portal --> Projects & Apps --> under Apps, Keys and tokens).
#
# Credentials are read from environment variables (e.g. set in ~/.Renviron)
# instead of being hard-coded, so secrets never end up in the script or in
# knitted output. Arguments are passed with `=`: using `<-` inside a call
# matches them by position only and creates stray global variables.
twitter_env_vars <- c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
)
twitter_env_missing <- twitter_env_vars[!nzchar(Sys.getenv(twitter_env_vars))]
if (length(twitter_env_missing) > 0) {
  # Fail early with a clear message rather than with an opaque API error.
  stop(
    "Missing Twitter credentials; set environment variable(s): ",
    paste(twitter_env_missing, collapse = ", "),
    call. = FALSE
  )
}

twitter_token <- rtweet::create_token(
  app = "Data methods Web data",
  consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY"),
  consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
  access_token = Sys.getenv("TWITTER_ACCESS_TOKEN"),
  access_secret = Sys.getenv("TWITTER_ACCESS_SECRET")
)

Search for 1,000 English-language tweets by keyword

# Keyword search: request up to 1000 English-language tweets matching
# "JoeBiden", authenticating with the token created above.
jb <- rtweet::search_tweets(
  q = "JoeBiden",
  n = 1000,
  lang = "en",
  token = twitter_token
)

Preview the users data

# Print the user-level columns for the accounts behind the tweets in `jb`.
rtweet::users_data(jb)
## # A tibble: 943 x 20
##    user_id   screen_name  name        location description      url    protected
##    <chr>     <chr>        <chr>       <chr>    <chr>            <chr>  <lgl>    
##  1 29472026~ heavyed65    "Eddie DiF~ ""       "Activist for a~ <NA>   FALSE    
##  2 29472026~ heavyed65    "Eddie DiF~ ""       "Activist for a~ <NA>   FALSE    
##  3 12270465~ MikeRecon_0~ "<U+270A>Foxy Mik~ "Pomona~ "U.S. Marine, U~ <NA>   FALSE    
##  4 13310139~ Dana32019207 "Dana"      ""       "I promote trut~ <NA>   FALSE    
##  5 465316012 JoniMangare~ "INTERPRET" "Brookl~ ""               https~ FALSE    
##  6 878511020 shay_nurse   "Sharon De~ "Newnan~ "Love walking, ~ <NA>   FALSE    
##  7 12664027~ kiranjohnso~ "Kiran"     ""       ""               <NA>   FALSE    
##  8 23465332~ JWC3D        "James Cox" ""       ""               <NA>   FALSE    
##  9 23465332~ JWC3D        "James Cox" ""       ""               <NA>   FALSE    
## 10 23465332~ JWC3D        "James Cox" ""       ""               <NA>   FALSE    
## # ... with 933 more rows, and 13 more variables: followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Get tweets from a user's timeline (by screen name)

# Fetch up to 1000 of the most recent tweets from the @JoeBiden timeline.
# The token is passed explicitly, matching the search_tweets() calls in this
# script, so the request does not depend on whichever token rtweet finds by
# default.
jb_timeline <- rtweet::get_timelines(
  user = "JoeBiden",
  n = 1000,
  token = twitter_token
)

Plot a time series of tweet frequency

# Tweet counts from the timeline aggregated by day, drawn with the
# black-and-white ggplot2 theme.
rtweet::ts_plot(jb_timeline, by = "days") +
  ggplot2::theme_bw()

Boolean search for a large quantity of tweets (which could take a while; n is kept at 100 here, so increase it for a larger sample)

# Boolean OR query: tweets matching any of the three terms.
# retryonratelimit = TRUE is passed so that larger requests can continue
# after hitting the API rate limit (see the rtweet documentation).
jb1 <- rtweet::search_tweets(
  q = "JoeBiden OR president OR potus",
  n = 100,
  retryonratelimit = TRUE
)

Graphing retweet connections

## Warning: could take some time to create the igraph
## Suggestion: start from a smaller data file
## Credit: Russell, Matthew. 2018. 21 Recipes for Mining Twitter Data with rtweet
## https://rud.is/books/21-recipes/visualizing-a-graph-of-retweet-relationships.html

Create an igraph object from the Twitter data, with an edge from each tweet author's screen name to every screen name mentioned in the tweet.

ggraph can draw the network graph in 12 different layouts.

# Build a directed graph from the "JoeBiden" search results: keep tweets that
# were retweeted at least once, expand the mentions column to one row per
# (author, mentioned screen name) pair, drop rows without a mention, and use
# each remaining pair as an edge.
# dplyr:: is spelled out because several attached packages mask each other.
jb_g <- jb %>%
  dplyr::filter(retweet_count > 0) %>%
  dplyr::select(screen_name, mentions_screen_name) %>%
  tidyr::unnest(mentions_screen_name) %>%
  dplyr::filter(!is.na(mentions_screen_name)) %>%
  igraph::graph_from_data_frame()

# Compute the degree once; only vertices with degree > 20 get a label, and
# the label size is driven by that degree (0 for everything else).
jb_degree <- unname(igraph::degree(jb_g))
jb_is_hub <- jb_degree > 20
V(jb_g)$node_label <- ifelse(jb_is_hub, V(jb_g)$name, "")
V(jb_g)$node_size <- ifelse(jb_is_hub, jb_degree, 0)

# ggraph layouts: 'star', 'circle', 'gem', 'dh', 'graphopt', 'grid', 'mds',
# 'randomly', 'fr', 'kk', 'drl', 'lgl'
# 'dh' is the Davidson-Harel algorithm; try also 'fr' (Fruchterman-Reingold).
ggraph(jb_g, layout = "dh") +
  # after_stat(index) replaces the deprecated ..index.. notation.
  geom_edge_arc(edge_width = 0.1, aes(alpha = after_stat(index))) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0, fill = "#ffffff66", segment.colour = "light blue",
    color = "red", repel = TRUE
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(
    title = "Joe Biden Twitter Plot",
    subtitle = "Edges=volume of retweets. Screenname size=influence"
  ) +
  theme_bw() +
  theme(legend.position = "none")
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Kamala Harris’s tweets

Search for 100 English-language tweets by keyword

# Keyword search: request up to 100 English-language tweets matching
# "KamalaHarris", authenticating with the token created earlier.
kh <- rtweet::search_tweets(
  q = "KamalaHarris",
  n = 100,
  lang = "en",
  token = twitter_token
)

Preview the users data

# Print the user-level columns for the accounts behind the tweets in `kh`.
rtweet::users_data(kh)
## # A tibble: 96 x 20
##    user_id  screen_name  name     location      description      url   protected
##    <chr>    <chr>        <chr>    <chr>         <chr>            <chr> <lgl>    
##  1 8725453~ midwifeMary~ "Mary L~ "\U0001f1ee\~ "<U+2618><U+FE0F>\U0001f1ee\U~ <NA>  FALSE    
##  2 1267183~ phortuenti   "bread ~ ""            "We will know o~ <NA>  FALSE    
##  3 4632389~ fbravo56     "Fbravo~ ""            ""               <NA>  FALSE    
##  4 2454936~ rolltak      "awkwar~ ""            "In the words o~ <NA>  FALSE    
##  5 1369946~ Ihaveadream~ "Countr~ ""            "Freedom!!"      <NA>  FALSE    
##  6 1247640~ bestbiafra   "Bestbi~ ""            "BIAFRAN *Freed~ <NA>  FALSE    
##  7 1004141~ LuisaColomb~ "LuisaM~ "USAQUEN"     "#IamJewis #Rev~ <NA>  FALSE    
##  8 1004141~ LuisaColomb~ "LuisaM~ "USAQUEN"     "#IamJewis #Rev~ <NA>  FALSE    
##  9 1004141~ LuisaColomb~ "LuisaM~ "USAQUEN"     "#IamJewis #Rev~ <NA>  FALSE    
## 10 9744193~ cipherEqual~ "Cipher~ "Now"         "Celebrating th~ <NA>  FALSE    
## # ... with 86 more rows, and 13 more variables: followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Get tweets from a user's timeline (by screen name)

# Fetch up to 100 of the most recent tweets from the @KamalaHarris timeline.
# The token is passed explicitly, matching the search_tweets() calls in this
# script, so the request does not depend on whichever token rtweet finds by
# default.
kh_timeline <- rtweet::get_timelines(
  user = "KamalaHarris",
  n = 100,
  token = twitter_token
)

Plot a time series of tweet frequency

# Tweet counts from the timeline aggregated by day, drawn with the
# black-and-white ggplot2 theme.
rtweet::ts_plot(kh_timeline, by = "days") +
  ggplot2::theme_bw()

Graphing retweet connections

Create an igraph object from the Twitter data, with an edge from each tweet author's screen name to every screen name mentioned in the tweet.

ggraph can draw the network graph in 12 different layouts.

# Build a directed graph from the "KamalaHarris" search results: keep tweets
# that were retweeted at least once, expand the mentions column to one row per
# (author, mentioned screen name) pair, drop rows without a mention, and use
# each remaining pair as an edge.
# dplyr:: is spelled out because several attached packages mask each other.
kh_g <- kh %>%
  dplyr::filter(retweet_count > 0) %>%
  dplyr::select(screen_name, mentions_screen_name) %>%
  tidyr::unnest(mentions_screen_name) %>%
  dplyr::filter(!is.na(mentions_screen_name)) %>%
  igraph::graph_from_data_frame()

# Compute the degree once; only vertices with degree > 20 get a label, and
# the label size is driven by that degree (0 for everything else).
kh_degree <- unname(igraph::degree(kh_g))
kh_is_hub <- kh_degree > 20
V(kh_g)$node_label <- ifelse(kh_is_hub, V(kh_g)$name, "")
V(kh_g)$node_size <- ifelse(kh_is_hub, kh_degree, 0)

# ggraph layouts: 'star', 'circle', 'gem', 'dh', 'graphopt', 'grid', 'mds',
# 'randomly', 'fr', 'kk', 'drl', 'lgl'
# 'dh' is the Davidson-Harel algorithm; try also 'fr' (Fruchterman-Reingold).
ggraph(kh_g, layout = "dh") +
  # after_stat(index) replaces the deprecated ..index.. notation.
  geom_edge_arc(edge_width = 0.1, aes(alpha = after_stat(index))) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0, fill = "#ffffff66", segment.colour = "light blue",
    color = "red", repel = TRUE
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(
    title = "Kamala Harris Twitter Plot",
    subtitle = "Edges=volume of retweets. Screenname size=influence"
  ) +
  theme_bw() +
  theme(legend.position = "none")