Graphic by Heikki Ritaluoma

Welcome

This GitHub page includes explorations of the former prime minister of Finland Alexander Stubb’s tweets, retreived from the twitter API on Jan 2016 and ananalyzed using the R programming language for statistical analysis. The codes for the analysis are available at the StubbTweets repository. An article including the graphics displayed here can be found here (in finnish).

The data

glimpse(tw)
Observations: 1,740
Variables: 20
$ text          <chr> "Tässä viimeisin Pyöräily+Triathlon kolumni. Mal...
$ favorited     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ favoriteCount <dbl> 30, 11, 19, 46, 17, 26, 23, 16, 14, 6, 21, 10, 1...
$ replyToSN     <chr> NA, "Lagarde", "TuomasEnbuske", NA, NA, NA, NA, ...
$ created       <dttm> 2016-02-19 20:56:03, 2016-02-19 18:44:48, 2016-...
$ truncated     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ replyToSID    <chr> NA, "700722045529755648", "700474680575180800", ...
$ id            <dbl> 7.007557e+17, 7.007227e+17, 7.006924e+17, 7.0067...
$ replyToUID    <chr> NA, "304909941", "420281182", NA, NA, NA, NA, NA...
$ statusSource  <chr> "<a href=\"http://twitter.com/download/iphone\" ...
$ screenName    <chr> "alexstubb", "alexstubb", "alexstubb", "alexstub...
$ retweetCount  <dbl> 1, 7, 1, 2, 2, 23, 5, 4, 5, 4, 0, 3, 13, 9, 8, 1...
$ isRetweet     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ retweeted     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ longitude     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ latitude      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ popularity    <dbl> 31, 18, 20, 48, 19, 49, 28, 20, 19, 10, 21, 13, ...
$ tweet         <chr> "tässä viimeisin pyöräilytriathlon kolumni malti...
$ lang          <chr> "finnish", "english", "swedish", "finnish", "fin...
$ tunti         <dbl> 20, 18, 16, 15, 13, 9, 21, 21, 21, 18, 11, 11, 1...

Monthly tweets

par(mar=c(8,9,6,4))
h <- hist(tw$created, main = paste(tweeter, "tweets by month"),
     breaks="month", freq=T, 
     xlab="",ylab="", las=2,
     labels=T, cex.axis=1.3, cex.lab=1.5,
     ylim= c(0,400), format="%Y-%m",
     col="cadetblue3")
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)

Daily tweets

par(mar=c(7,9,5,3))
h <- hist(tw$created, main = paste(tweeter, "tweets by day"),
          breaks="days", freq=T, 
          xlab="", ylab="",col="grey55", lty=0,
          cex.axis=1.3 ,cex.lab=1.5, tck=0.05, xaxt="n")
peaks=2
tickpos <- h$breaks[order(h$counts,decreasing=T)[1:peaks]]
labels <- names(sort(table(format(tw$created,"%d.%m.%Y")),decreasing=T))[1:peaks]
axis(1, at=tickpos, labels=labels,cex.axis=1.5)
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)
mtext(side = 1, text = "day", line = 3, cex = 1.5)

Hourly tweets

hourly <- table(format(tw$created,"%H"))
hourly_prc <- paste(round(100*hourly/sum(hourly),1),"%")
par(mar=c(8,10,5,3))
bp <- barplot(hourly, main = paste(tweeter, "tweets by hour"),
              space=0.5, ylim=c(0,250),
              ylab="",cex.lab=1, cex.names = 1.2,
              cex.axis=1.2, las=2, col = "deepskyblue3",
              xlab="")
text(bp+0.1, hourly, hourly_prc, pos=3, cex=0.7) 
mtext(side = 2, text = "tweets", line = 5, cex = 1.3)
mtext(side = 1, text = "time", line = 5, cex = 1.3)

Most active tweet days

top5tweetdays <- sort(table(Date = format(tw$created,"%Y-%m-%d")),decreasing=T)[1:5]
data.frame(top5tweetdays)
Date Freq
2015-12-16 43
2015-04-18 36
2015-04-01 22
2015-04-16 22
2015-03-10 21

Tweet times during the most active days

top5days <- names(top5tweetdays)

topdaydata <- lapply(1:4,function(day) {
  get_datedata(tw, top5days[day])
})

newpar <- par(mfrow=c(2,2))
for(i in 1:4){
  time <- topdaydata[[i]]$created
  hist(time, breaks=100,freq=T,border=NULL,ylab="Tweets",
       tck=0,cex.axis=0.8, cex.lab=0.8, xlab="",
       main=paste0(tweeter," ",top5days[i]),
       cex.main=1, ylim=c(0,8))
}

Most used hashtags

hashcounts <- extract_hashes(tw$text)
hash <- head(hashcounts, 5)
kable(hash)
tag freq
#kokoomus 87
#vaalit2015 78
#eurogroup 62
#työnkautta 38
#euco 24
par(las=2, mar=c(5,14,4,2))
barplot(hash$freq, horiz=T, names.arg=hash$tag,
        cex.names=1.3, cex.axis = 1.5,
        col = "skyblue", border=NA)

Most used words

# document term matrices for each language
DTM <- get(load(paste0("data/",tweeter,"_DTM.Rda")))

# word frequencies for each language
FREQ <- lapply(DTM, twitter_wordfreqs)
names(FREQ) <- names(DTM)

# wordclouds
par(mfrow= c(3,1))
temp <- lapply(FREQ, function(lang) {
  suppressWarnings(
    wordcloud(lang$word, lang$freq,
            scale = c(5,1),
            random.order=FALSE, colors=brewer.pal(8, "Dark2")))
})

Topic model

TW <- get(load(paste0("data/",tweeter,"_topicdata.Rda")))
tw_fi <- TW[[2]]
topic_labels <- c("Kannanottoja", "SuomiNousuun","KookoomusTsemppi","Kansanviestit")
tw_fi[["aihe"]] <- factor(tw_fi$topic, labels = topic_labels)
tw_fi <- tw_fi[!is.na(tw_fi$aihe),]

Average popularity by topic

library(dplyr)
group_by(tw_fi, aihe) %>% 
  summarise(keskisuosio = round(mean(suosio)), mediaanisuosio = round(quantile(suosio, probs=0.5)))
aihe keskisuosio mediaanisuosio
Kannanottoja 50 34
SuomiNousuun 60 33
KookoomusTsemppi 64 28
Kansanviestit 49 24

Number of tweets by month and popularity

# popularity by month
df_summary <- get_summary(tw_fi, "kuukausi")
q <- ggplot(df_summary,aes(aika,tweets, size=keski_suosio)) + geom_point() 
q <- q + ylab("tweettejä") 
q <- q + scale_x_discrete() + xlab("")
q <- q + theme(axis.text.x=element_text(size = 10,angle = -90, hjust = 0),
               axis.text.y = element_text(size = 15),
               axis.title.y = element_text(size=15),
               legend.text = element_text(size = 15),
               legend.title = element_text(size=10))
q

Popularity by topics (log scale)

# help funtion for scale_y_log10
fmt <- function(){
  function(x) format(x, nsmall=0L, scientific = FALSE)
}

q <-ggplot(tw_fi,aes(x=aihe, y = suosio)) + 
  geom_boxplot(outlier.size = 3)
q <- q + scale_y_log10(labels=fmt())
q + theme(text= element_text(size=15))

Tweets by topic, hour and popularity

q <- ggplot(tw_fi,aes(topic,tunti,color=aihe, size=suosio)) + 
  geom_point(shape=19, position="jitter", alpha=0.5) + xlab("") + 
  scale_size(range = c(1, 10))
q <- q + theme(text = element_text(size =20),
          legend.key.size=unit(1,"cm"),
          legend.text = element_text(size = 10))

q + guides(colour=guide_legend(override.aes=list(size=5)))

Regression analysis

tw_fi$vastaus <- !is.na(tw_fi$replyToSN)
my_lm <- lm(suosio~aihe+tunti+vastaus,data=tw_fi)

summary(my_lm)

Call:
lm(formula = suosio ~ aihe + tunti + vastaus, data = tw_fi)

Residuals:
   Min     1Q Median     3Q    Max 
-79.54 -40.55 -15.51  13.49 604.04 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)           36.0553     9.0340   3.991 7.29e-05 ***
aiheSuomiNousuun       1.3079     8.1524   0.160   0.8726    
aiheKookoomusTsemppi  15.8333     8.1763   1.936   0.0532 .  
aiheKansanviestit     -7.2558     8.2132  -0.883   0.3773    
tunti                  2.2432     0.5508   4.073 5.20e-05 ***
vastausTRUE          -61.4565     7.3406  -8.372 3.24e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 73.75 on 678 degrees of freedom
Multiple R-squared:  0.1151,    Adjusted R-squared:  0.1086 
F-statistic: 17.64 on 5 and 678 DF,  p-value: < 2.2e-16

A comment from Stubb

The former prime minister commented on the analysis using twitter.