Graphic by Heikki Ritaluoma

Welcome

This GitHub page includes explorations of tweets by Alexander Stubb, the former prime minister of Finland, retrieved from the Twitter API in January 2016 and analyzed using the R programming language for statistical analysis. The code for the analysis is available at the StubbTweets repository. An article including the graphics displayed here can be found here (in Finnish).

The data

# Print a compact, column-by-column overview of the tweet data `tw`.
glimpse(tw)

Observations: 1,740
Variables: 20
$ text          <chr> "Tässä viimeisin Pyöräily+Triathlon kolumni. Mal...
$ favorited     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ favoriteCount <dbl> 30, 11, 19, 46, 17, 26, 23, 16, 14, 6, 21, 10, 1...
$ replyToSN     <chr> NA, "Lagarde", "TuomasEnbuske", NA, NA, NA, NA, ...
$ created       <dttm> 2016-02-19 20:56:03, 2016-02-19 18:44:48, 2016-...
$ truncated     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ replyToSID    <chr> NA, "700722045529755648", "700474680575180800", ...
$ id            <dbl> 7.007557e+17, 7.007227e+17, 7.006924e+17, 7.0067...
$ replyToUID    <chr> NA, "304909941", "420281182", NA, NA, NA, NA, NA...
$ statusSource  <chr> "<a href=\"http://twitter.com/download/iphone\" ...
$ screenName    <chr> "alexstubb", "alexstubb", "alexstubb", "alexstub...
$ retweetCount  <dbl> 1, 7, 1, 2, 2, 23, 5, 4, 5, 4, 0, 3, 13, 9, 8, 1...
$ isRetweet     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ retweeted     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ longitude     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ latitude      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ popularity    <dbl> 31, 18, 20, 48, 19, 49, 28, 20, 19, 10, 21, 13, ...
$ tweet         <chr> "tässä viimeisin pyöräilytriathlon kolumni malti...
$ lang          <chr> "finnish", "english", "swedish", "finnish", "fin...
$ tunti         <dbl> 20, 18, 16, 15, 13, 9, 21, 21, 21, 18, 11, 11, 1...

Monthly tweets

# Histogram of tweet counts per calendar month.
# The wide bottom/left margins leave room for the rotated month labels and
# for the y-axis title, which is drawn separately with mtext().
par(mar = c(8, 9, 6, 4))
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
# labels = TRUE prints the count above each bar; ylim is a fixed ceiling.
h <- hist(tw$created, main = paste(tweeter, "tweets by month"),
          breaks = "month", freq = TRUE,
          xlab = "", ylab = "", las = 2,
          labels = TRUE, cex.axis = 1.3, cex.lab = 1.5,
          ylim = c(0, 400), format = "%Y-%m",
          col = "cadetblue3")
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)

Daily tweets

# Histogram of tweet counts per calendar day; only the busiest days are
# marked on the x-axis (the default axis is suppressed with xaxt = "n").
par(mar = c(7, 9, 5, 3))
h <- hist(tw$created, main = paste(tweeter, "tweets by day"),
          breaks = "days", freq = TRUE,
          xlab = "", ylab = "", col = "grey55", lty = 0,
          cex.axis = 1.3, cex.lab = 1.5, tck = 0.05, xaxt = "n")
# Number of busiest days to label.
peaks <- 2
# Indices of the bins with the highest counts. Tick positions and tick labels
# are both derived from this single ordering, so they cannot disagree when
# several days tie on count (a separately sorted table of "dd.mm.YYYY"
# strings breaks ties alphabetically rather than chronologically).
top_bins <- head(order(h$counts, decreasing = TRUE), peaks)
tickpos <- h$breaks[top_bins]
# h$breaks holds the left edge of each daily bin as seconds since the epoch;
# turn it back into a date-time in the data's own time zone before formatting.
labels <- format(.POSIXct(tickpos, tz = attr(tw$created, "tzone")),
                 "%d.%m.%Y")
axis(1, at = tickpos, labels = labels, cex.axis = 1.5)
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)
mtext(side = 1, text = "day", line = 3, cex = 1.5)

Hourly tweets

# Tweets per hour of day as a bar chart, with each hour's share of all
# tweets printed above its bar.
par(mar = c(8, 10, 5, 3))
hourly <- table(format(tw$created, "%H"))
hourly_prc <- paste(round(100 * hourly / sum(hourly), 1), "%")
bp <- barplot(
  hourly,
  main = paste(tweeter, "tweets by hour"),
  col = "deepskyblue3",
  space = 0.5,
  ylim = c(0, 250),
  xlab = "",
  ylab = "",
  las = 2,
  cex.lab = 1,
  cex.names = 1.2,
  cex.axis = 1.2
)
# barplot() returns the bar midpoints; the labels are nudged slightly right
# and placed above the bar tops (pos = 3).
text(x = bp + 0.1, y = hourly, labels = hourly_prc, pos = 3, cex = 0.7)
mtext(text = "tweets", side = 2, line = 5, cex = 1.3)
mtext(text = "time", side = 1, line = 5, cex = 1.3)

Most active tweet days

# Tweet counts per calendar day, most active first.
top5tweetdays <- sort(table(Date = format(tw$created, "%Y-%m-%d")),
                      decreasing = TRUE)
# Keep at most five days; indexing with a fixed 1:5 would pad the result
# with NA entries when the data span fewer than five distinct days.
top5tweetdays <- top5tweetdays[seq_len(min(5L, length(top5tweetdays)))]
data.frame(top5tweetdays)

Date	Freq
2015-12-16	43
2015-04-18	36
2015-04-01	22
2015-04-16	22
2015-03-10	21

Tweet times during the most active days

# Intraday tweet-time histograms for the four most active days (2 x 2 grid).
top5days <- names(top5tweetdays)

# One data subset per day; only the first four of the top days are plotted.
# get_datedata() is a project helper -- presumably it returns the rows of
# `tw` created on the given date; confirm against its definition.
topdaydata <- lapply(seq_len(4), function(day) {
  get_datedata(tw, top5days[day])
})

# par() returns the settings that were in effect before the call, so `newpar`
# actually holds the previous layout. It is restored after the loop so the
# 2 x 2 grid does not leak into later plots on the same device.
newpar <- par(mfrow = c(2, 2))
for (i in seq_along(topdaydata)) {
  time <- topdaydata[[i]]$created
  hist(time, breaks = 100, freq = TRUE, border = NULL, ylab = "Tweets",
       tck = 0, cex.axis = 0.8, cex.lab = 0.8, xlab = "",
       main = paste0(tweeter, " ", top5days[i]),
       cex.main = 1, ylim = c(0, 8))
}
par(newpar)

Most used hashtags

# Count hashtag occurrences in the raw tweet texts and show the first five
# entries as a table.
# extract_hashes() is a project helper; judging by the columns read from its
# result further down, it presumably returns a data frame with `tag` and
# `freq`, already ordered by frequency -- confirm against its definition.
hashcounts <- extract_hashes(tw$text)
hash <- head(hashcounts, n = 5)
kable(hash)

tag	freq
#kokoomus	87
#vaalit2015	78
#eurogroup	62
#työnkautta	38
#euco	24

# Horizontal bar chart of the top hashtags. las = 2 draws axis labels
# perpendicular to their axis, so the tag names are printed horizontally;
# the wide left margin makes room for them.
par(las = 2, mar = c(5, 14, 4, 2))
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
barplot(hash$freq, horiz = TRUE, names.arg = hash$tag,
        cex.names = 1.3, cex.axis = 1.5,
        col = "skyblue", border = NA)

Most used words

# Document-term matrices, one per language. load() returns the name of the
# object it restored, and get() then fetches that object.
DTM <- get(load(paste0("data/", tweeter, "_DTM.Rda")))

# Word frequencies per language, carrying over the names of DTM.
FREQ <- setNames(lapply(DTM, twitter_wordfreqs), names(DTM))

# One word cloud per element of FREQ, stacked in three rows.
par(mfrow = c(3, 1))
# The result is assigned only to keep lapply() from printing its return value.
temp <- lapply(FREQ, function(freqs) {
  suppressWarnings(
    wordcloud(
      words = freqs$word,
      freq = freqs$freq,
      scale = c(5, 1),
      random.order = FALSE,
      colors = brewer.pal(8, "Dark2")
    )
  )
})

Topic model

# Load the topic-model data and keep its second element. The variable name
# suggests this is the Finnish-language subset -- confirm against the data.
TW <- get(load(paste0("data/", tweeter, "_topicdata.Rda")))
tw_fi <- TW[[2]]
# Readable names for the topics; factor() assigns them to the sorted unique
# values of `topic`, in that order.
topic_labels <- c("Kannanottoja", "SuomiNousuun", "KookoomusTsemppi",
                  "Kansanviestit")
tw_fi$aihe <- factor(tw_fi$topic, labels = topic_labels)
# Drop rows that have no topic.
tw_fi <- tw_fi[!is.na(tw_fi$aihe), ]

Average popularity by topic

library(dplyr)
# Mean and median of `suosio` per topic (`aihe`), rounded to whole numbers.
tw_fi %>%
  group_by(aihe) %>%
  summarise(
    keskisuosio = round(mean(suosio)),
    mediaanisuosio = round(quantile(suosio, probs = 0.5))
  )

aihe	keskisuosio	mediaanisuosio
Kannanottoja	50	34
SuomiNousuun	60	33
KookoomusTsemppi	64	28
Kansanviestit	49	24

Number of tweets by month and popularity

# Tweets per month as points, with point size mapped to `keski_suosio`.
# get_summary() is a project helper; the columns used here (aika, tweets,
# keski_suosio) are read from its result.
df_summary <- get_summary(tw_fi, "kuukausi")
q <- ggplot(df_summary, aes(x = aika, y = tweets, size = keski_suosio)) +
  geom_point() +
  ylab("tweettejä") +
  scale_x_discrete() +
  xlab("") +
  theme(
    axis.text.x = element_text(size = 10, angle = -90, hjust = 0),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 10)
  )
q

Popularity by topics (log scale)

# Helper for scale_y_log10(): returns a labelling function that formats
# values in fixed (non-scientific) notation with no forced decimals.
fmt <- function() {
  plain_label <- function(x) {
    format(x, nsmall = 0L, scientific = FALSE)
  }
  plain_label
}

# Box plot of `suosio` per topic on a log10 y-axis with plain-number labels.
q <- ggplot(tw_fi, aes(x = aihe, y = suosio)) +
  geom_boxplot(outlier.size = 3) +
  scale_y_log10(labels = fmt())
# The larger text size is applied for display only; `q` itself is not updated.
q + theme(text = element_text(size = 15))

Tweets by topic, hour and popularity

# Jittered scatter of tweets: topic on x, `tunti` on y, coloured by topic
# label and sized by `suosio`.
q <- ggplot(tw_fi, aes(x = topic, y = tunti, color = aihe, size = suosio)) +
  geom_point(shape = 19, position = "jitter", alpha = 0.5) +
  xlab("") +
  scale_size(range = c(1, 10)) +
  theme(
    text = element_text(size = 20),
    legend.key.size = unit(1, "cm"),
    legend.text = element_text(size = 10)
  )

# Draw the colour legend keys at a fixed size instead of the mapped sizes.
q + guides(colour = guide_legend(override.aes = list(size = 5)))

Regression analysis

# Flag tweets that have a reply-to screen name, then fit a linear model of
# `suosio` on topic, `tunti` and the reply flag.
tw_fi[["vastaus"]] <- !is.na(tw_fi[["replyToSN"]])
my_lm <- lm(suosio ~ aihe + tunti + vastaus, data = tw_fi)

summary(my_lm)


Call:
lm(formula = suosio ~ aihe + tunti + vastaus, data = tw_fi)

Residuals:
   Min     1Q Median     3Q    Max 
-79.54 -40.55 -15.51  13.49 604.04 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)           36.0553     9.0340   3.991 7.29e-05 ***
aiheSuomiNousuun       1.3079     8.1524   0.160   0.8726    
aiheKookoomusTsemppi  15.8333     8.1763   1.936   0.0532 .  
aiheKansanviestit     -7.2558     8.2132  -0.883   0.3773    
tunti                  2.2432     0.5508   4.073 5.20e-05 ***
vastausTRUE          -61.4565     7.3406  -8.372 3.24e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 73.75 on 678 degrees of freedom
Multiple R-squared:  0.1151,    Adjusted R-squared:  0.1086 
F-statistic: 17.64 on 5 and 678 DF,  p-value: < 2.2e-16

A comment from Stubb

The former prime minister commented on the analysis on Twitter.

StubbTweets

Tuomo Nieminen

01.02.2016