Knowledge Mining: Text mining

File: textdata_mining02.R

Theme: Download Twitter data for network and sentiment analyses

Load packages

library(rtweet)

## Warning: package 'rtweet' was built under R version 4.0.4

library(igraph)

## Warning: package 'igraph' was built under R version 4.0.4

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.4

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'tidyr' was built under R version 4.0.4

## Warning: package 'readr' was built under R version 4.0.3

## Warning: package 'dplyr' was built under R version 4.0.4

## Warning: package 'forcats' was built under R version 4.0.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::as_data_frame() masks tibble::as_data_frame(), igraph::as_data_frame()
## x purrr::compose()       masks igraph::compose()
## x tidyr::crossing()      masks igraph::crossing()
## x dplyr::filter()        masks stats::filter()
## x purrr::flatten()       masks rtweet::flatten()
## x dplyr::groups()        masks igraph::groups()
## x dplyr::lag()           masks stats::lag()
## x purrr::simplify()      masks igraph::simplify()

library(ggraph)

## Warning: package 'ggraph' was built under R version 4.0.4

library(data.table)

## Warning: package 'data.table' was built under R version 4.0.4

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

Acquire API key and token from Twitter developer website

# Check https://datageneration.org/adp/twitter/ for detail

# Create a token for direct authentication to access Twitter data.
# The key and tokens come from the Twitter developer account
# (Developer Portal --> Projects & Apps --> under Apps, Keys and tokens).
#
# The four credentials are read from environment variables (for example set
# in ~/.Renviron) rather than written into the script, so that the secrets
# are not published together with the code or its rendered output.
# NOTE(review): any credentials that were ever committed in this script
# should be treated as compromised and regenerated in the Developer Portal.
twitter_credentials <- Sys.getenv(
  c(
    "TWITTER_CONSUMER_KEY",
    "TWITTER_CONSUMER_SECRET",
    "TWITTER_ACCESS_TOKEN",
    "TWITTER_ACCESS_SECRET"
  )
)

# Fail early with a clear message instead of sending empty keys to the API.
missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
if (length(missing_credentials) > 0) {
  stop(
    "Missing Twitter credentials in environment variables: ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE
  )
}

# Arguments are passed by name with `=`; using `<-` inside the call would
# pass them positionally and create stray variables in the global environment.
twitter_token <- rtweet::create_token(
  app = "Data methods Web data",
  consumer_key = twitter_credentials[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_credentials[["TWITTER_ACCESS_SECRET"]]
)

Search for 1,000 English-language tweets by keyword

# Search for up to 1000 English-language tweets matching the keyword
# "JoeBiden", authenticating with the token stored in `twitter_token`.
jb <- rtweet::search_tweets(
  q = "JoeBiden",
  n = 1000,
  lang = "en",
  token = twitter_token
)

preview users data

# Print the user (account) information attached to the tweets in `jb`.
jb %>%
  rtweet::users_data()

## # A tibble: 943 x 20
##    user_id   screen_name  name        location description      url    protected
##    <chr>     <chr>        <chr>       <chr>    <chr>            <chr>  <lgl>    
##  1 29472026~ heavyed65    "Eddie DiF~ ""       "Activist for a~ <NA>   FALSE    
##  2 29472026~ heavyed65    "Eddie DiF~ ""       "Activist for a~ <NA>   FALSE    
##  3 12270465~ MikeRecon_0~ "<U+270A>Foxy Mik~ "Pomona~ "U.S. Marine, U~ <NA>   FALSE    
##  4 13310139~ Dana32019207 "Dana"      ""       "I promote trut~ <NA>   FALSE    
##  5 465316012 JoniMangare~ "INTERPRET" "Brookl~ ""               https~ FALSE    
##  6 878511020 shay_nurse   "Sharon De~ "Newnan~ "Love walking, ~ <NA>   FALSE    
##  7 12664027~ kiranjohnso~ "Kiran"     ""       ""               <NA>   FALSE    
##  8 23465332~ JWC3D        "James Cox" ""       ""               <NA>   FALSE    
##  9 23465332~ JWC3D        "James Cox" ""       ""               <NA>   FALSE    
## 10 23465332~ JWC3D        "James Cox" ""       ""               <NA>   FALSE    
## # ... with 933 more rows, and 13 more variables: followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Get tweets from a user's timeline by screen name

# Fetch up to 1000 tweets from the timeline of the account "JoeBiden".
jb_timeline <- rtweet::get_timelines("JoeBiden", n = 1000)

plot time series of tweets frequency

# Plot the frequency of the timeline tweets, aggregated by day.
rtweet::ts_plot(jb_timeline, by = "days") +
  ggplot2::theme_bw()

Boolean search for a large quantity of tweets (which could take a while)

# Boolean (OR) keyword search for up to 100 tweets; `retryonratelimit = TRUE`
# is passed through to rtweet's rate-limit handling.
jb1 <- rtweet::search_tweets(
  q = "JoeBiden OR president OR potus",
  n = 100,
  retryonratelimit = TRUE
)

Graphing retweet connections

## Warning: could take some time to create the igraph
## Suggestion: start from a smaller data file
## Credit: Russell, Matthew. 2018. 21 Recipes for Mining Twitter Data with rtweet
## https://rud.is/books/21-recipes/visualizing-a-graph-of-retweet-relationships.html

Create an igraph object from the Twitter data, using each tweet author's screen name and the screen names mentioned in the tweet.

ggraph can draw the network graph in 12 different layouts.

# Build a directed graph from the tweets in `jb` that were retweeted at least
# once: each edge runs from the tweet author's screen name to one of the
# screen names mentioned in that tweet. Tweets without mentions are dropped.
# The dplyr/tidyr verbs are namespace-qualified because several of the
# packages attached in this script mask each other's functions.
jb_g <- jb %>%
  dplyr::filter(retweet_count > 0) %>%
  dplyr::select(screen_name, mentions_screen_name) %>%
  tidyr::unnest(cols = mentions_screen_name) %>%
  dplyr::filter(!is.na(mentions_screen_name)) %>%
  igraph::graph_from_data_frame()

# Compute the vertex degrees once and reuse them. Only vertices with a degree
# above 20 get a label and a non-zero size; all others get "" and 0.
jb_degree <- igraph::degree(jb_g)
jb_is_hub <- jb_degree > 20
V(jb_g)$node_label <- unname(ifelse(jb_is_hub, names(V(jb_g)), ""))
V(jb_g)$node_size <- unname(ifelse(jb_is_hub, jb_degree, 0))

# ggraph layouts: 'star', 'circle', 'gem', 'dh', 'graphopt', 'grid', 'mds', 
# 'randomly', 'fr', 'kk', 'drl', 'lgl'
# Davidson-Harel algorithm
# Try also fr (fruchterman reingold)

# Draw the network in `jb_g` with the Davidson-Harel ("dh") layout.
# Edge transparency is mapped to the computed `index` variable via
# after_stat(), which replaces the deprecated `..index..` notation.
# Labels and label sizes come from the `node_label` / `node_size` vertex
# attributes of the graph.
ggraph(jb_g, layout = "dh") +
  geom_edge_arc(aes(alpha = after_stat(index)), edge_width = 0.1) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0,
    fill = "#ffffff66",
    segment.colour = "light blue",
    color = "red",
    repel = TRUE
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(
    title = "Joe Biden Twitter Plot",
    subtitle = "Edges=volume of retweets. Screenname size=influence"
  ) +
  theme_bw() +
  theme(legend.position = "none")

## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Kamala Harris’s tweets

Search for 100 English-language tweets by keyword

# Search for up to 100 English-language tweets matching the keyword
# "KamalaHarris", authenticating with the token stored in `twitter_token`.
kh <- rtweet::search_tweets(
  q = "KamalaHarris",
  n = 100,
  lang = "en",
  token = twitter_token
)

preview users data

# Print the user (account) information attached to the tweets in `kh`.
kh %>%
  rtweet::users_data()

## # A tibble: 96 x 20
##    user_id  screen_name  name     location      description      url   protected
##    <chr>    <chr>        <chr>    <chr>         <chr>            <chr> <lgl>    
##  1 8725453~ midwifeMary~ "Mary L~ "\U0001f1ee\~ "<U+2618><U+FE0F>\U0001f1ee\U~ <NA>  FALSE    
##  2 1267183~ phortuenti   "bread ~ ""            "We will know o~ <NA>  FALSE    
##  3 4632389~ fbravo56     "Fbravo~ ""            ""               <NA>  FALSE    
##  4 2454936~ rolltak      "awkwar~ ""            "In the words o~ <NA>  FALSE    
##  5 1369946~ Ihaveadream~ "Countr~ ""            "Freedom!!"      <NA>  FALSE    
##  6 1247640~ bestbiafra   "Bestbi~ ""            "BIAFRAN *Freed~ <NA>  FALSE    
##  7 1004141~ LuisaColomb~ "LuisaM~ "USAQUEN"     "#IamJewis #Rev~ <NA>  FALSE    
##  8 1004141~ LuisaColomb~ "LuisaM~ "USAQUEN"     "#IamJewis #Rev~ <NA>  FALSE    
##  9 1004141~ LuisaColomb~ "LuisaM~ "USAQUEN"     "#IamJewis #Rev~ <NA>  FALSE    
## 10 9744193~ cipherEqual~ "Cipher~ "Now"         "Celebrating th~ <NA>  FALSE    
## # ... with 86 more rows, and 13 more variables: followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Get tweets from a user's timeline by screen name

# Fetch up to 100 tweets from the timeline of the account "KamalaHarris".
kh_timeline <- rtweet::get_timelines("KamalaHarris", n = 100)

plot time series of tweets frequency

# Plot the frequency of the timeline tweets, aggregated by day.
rtweet::ts_plot(kh_timeline, by = "days") +
  ggplot2::theme_bw()

Graphing retweet connections

Create an igraph object from the Twitter data, using each tweet author's screen name and the screen names mentioned in the tweet.

ggraph can draw the network graph in 12 different layouts.

# Build a directed graph from the tweets in `kh` that were retweeted at least
# once: each edge runs from the tweet author's screen name to one of the
# screen names mentioned in that tweet. Tweets without mentions are dropped.
# The dplyr/tidyr verbs are namespace-qualified because several of the
# packages attached in this script mask each other's functions.
kh_g <- kh %>%
  dplyr::filter(retweet_count > 0) %>%
  dplyr::select(screen_name, mentions_screen_name) %>%
  tidyr::unnest(cols = mentions_screen_name) %>%
  dplyr::filter(!is.na(mentions_screen_name)) %>%
  igraph::graph_from_data_frame()

# Compute the vertex degrees once and reuse them. Only vertices with a degree
# above 20 get a label and a non-zero size; all others get "" and 0.
kh_degree <- igraph::degree(kh_g)
kh_is_hub <- kh_degree > 20
V(kh_g)$node_label <- unname(ifelse(kh_is_hub, names(V(kh_g)), ""))
V(kh_g)$node_size <- unname(ifelse(kh_is_hub, kh_degree, 0))

# ggraph layouts: 'star', 'circle', 'gem', 'dh', 'graphopt', 'grid', 'mds', 
# 'randomly', 'fr', 'kk', 'drl', 'lgl'
# Davidson-Harel algorithm
# Try also fr (fruchterman reingold)

# Draw the network in `kh_g` with the Davidson-Harel ("dh") layout.
# Edge transparency is mapped to the computed `index` variable via
# after_stat(), which replaces the deprecated `..index..` notation.
# Labels and label sizes come from the `node_label` / `node_size` vertex
# attributes of the graph.
ggraph(kh_g, layout = "dh") +
  geom_edge_arc(aes(alpha = after_stat(index)), edge_width = 0.1) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0,
    fill = "#ffffff66",
    segment.colour = "light blue",
    color = "red",
    repel = TRUE
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(
    title = "Kamala Harris Twitter Plot",
    subtitle = "Edges=volume of retweets. Screenname size=influence"
  ) +
  theme_bw() +
  theme(legend.position = "none")

Assignment 6

Federico Ferrero

3/24/2021

Knowledge Mining: Text mining

File: textdata_mining02.R

Theme: Download Twitter data for network and sentiment analyses

Load packages

Acquire API key and token from Twitter developer website

search for 1000 tweets by keyword in English

preview users data

Get tweets by userid

plot time series of tweets frequency

Boolean search for large quantity of tweets (which could take a while)

Graphing retweet connections

Create igraph object from Twitter data using user id and mentioned id.

ggraph draws the network graph in different layouts (12).

Kamala Harris’s tweets

search for 100 tweets by keyword in English

preview users data

Get tweets by userid

plot time series of tweets frequency

Graphing retweet connections

Create igraph object from Twitter data using user id and mentioned id.

ggraph draws the network graph in different layouts (12).