Graphic by Heikki Ritaluoma
This GitHub page includes explorations of the former prime minister of Finland Alexander Stubb’s tweets, retreived from the twitter API on Jan 2016 and ananalyzed using the R programming language for statistical analysis. The codes for the analysis are available at the StubbTweets repository. An article including the graphics displayed here can be found here (in finnish).
glimpse(tw)
Observations: 1,740
Variables: 20
$ text <chr> "Tässä viimeisin Pyöräily+Triathlon kolumni. Mal...
$ favorited <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ favoriteCount <dbl> 30, 11, 19, 46, 17, 26, 23, 16, 14, 6, 21, 10, 1...
$ replyToSN <chr> NA, "Lagarde", "TuomasEnbuske", NA, NA, NA, NA, ...
$ created <dttm> 2016-02-19 20:56:03, 2016-02-19 18:44:48, 2016-...
$ truncated <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ replyToSID <chr> NA, "700722045529755648", "700474680575180800", ...
$ id <dbl> 7.007557e+17, 7.007227e+17, 7.006924e+17, 7.0067...
$ replyToUID <chr> NA, "304909941", "420281182", NA, NA, NA, NA, NA...
$ statusSource <chr> "<a href=\"http://twitter.com/download/iphone\" ...
$ screenName <chr> "alexstubb", "alexstubb", "alexstubb", "alexstub...
$ retweetCount <dbl> 1, 7, 1, 2, 2, 23, 5, 4, 5, 4, 0, 3, 13, 9, 8, 1...
$ isRetweet <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ retweeted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ longitude <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ latitude <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ popularity <dbl> 31, 18, 20, 48, 19, 49, 28, 20, 19, 10, 21, 13, ...
$ tweet <chr> "tässä viimeisin pyöräilytriathlon kolumni malti...
$ lang <chr> "finnish", "english", "swedish", "finnish", "fin...
$ tunti <dbl> 20, 18, 16, 15, 13, 9, 21, 21, 21, 18, 11, 11, 1...
par(mar=c(8,9,6,4))
h <- hist(tw$created, main = paste(tweeter, "tweets by month"),
breaks="month", freq=T,
xlab="",ylab="", las=2,
labels=T, cex.axis=1.3, cex.lab=1.5,
ylim= c(0,400), format="%Y-%m",
col="cadetblue3")
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)
par(mar=c(7,9,5,3))
h <- hist(tw$created, main = paste(tweeter, "tweets by day"),
breaks="days", freq=T,
xlab="", ylab="",col="grey55", lty=0,
cex.axis=1.3 ,cex.lab=1.5, tck=0.05, xaxt="n")
peaks=2
tickpos <- h$breaks[order(h$counts,decreasing=T)[1:peaks]]
labels <- names(sort(table(format(tw$created,"%d.%m.%Y")),decreasing=T))[1:peaks]
axis(1, at=tickpos, labels=labels,cex.axis=1.5)
mtext(side = 2, text = "tweets", line = 5, cex = 1.5)
mtext(side = 1, text = "day", line = 3, cex = 1.5)
hourly <- table(format(tw$created,"%H"))
hourly_prc <- paste(round(100*hourly/sum(hourly),1),"%")
par(mar=c(8,10,5,3))
bp <- barplot(hourly, main = paste(tweeter, "tweets by hour"),
space=0.5, ylim=c(0,250),
ylab="",cex.lab=1, cex.names = 1.2,
cex.axis=1.2, las=2, col = "deepskyblue3",
xlab="")
text(bp+0.1, hourly, hourly_prc, pos=3, cex=0.7)
mtext(side = 2, text = "tweets", line = 5, cex = 1.3)
mtext(side = 1, text = "time", line = 5, cex = 1.3)
top5tweetdays <- sort(table(Date = format(tw$created,"%Y-%m-%d")),decreasing=T)[1:5]
data.frame(top5tweetdays)
Date | Freq |
---|---|
2015-12-16 | 43 |
2015-04-18 | 36 |
2015-04-01 | 22 |
2015-04-16 | 22 |
2015-03-10 | 21 |
top5days <- names(top5tweetdays)
topdaydata <- lapply(1:4,function(day) {
get_datedata(tw, top5days[day])
})
newpar <- par(mfrow=c(2,2))
for(i in 1:4){
time <- topdaydata[[i]]$created
hist(time, breaks=100,freq=T,border=NULL,ylab="Tweets",
tck=0,cex.axis=0.8, cex.lab=0.8, xlab="",
main=paste0(tweeter," ",top5days[i]),
cex.main=1, ylim=c(0,8))
}
# document term matrices for each language
DTM <- get(load(paste0("data/",tweeter,"_DTM.Rda")))
# word frequencies for each language
FREQ <- lapply(DTM, twitter_wordfreqs)
names(FREQ) <- names(DTM)
# wordclouds
par(mfrow= c(3,1))
temp <- lapply(FREQ, function(lang) {
suppressWarnings(
wordcloud(lang$word, lang$freq,
scale = c(5,1),
random.order=FALSE, colors=brewer.pal(8, "Dark2")))
})
TW <- get(load(paste0("data/",tweeter,"_topicdata.Rda")))
tw_fi <- TW[[2]]
topic_labels <- c("Kannanottoja", "SuomiNousuun","KookoomusTsemppi","Kansanviestit")
tw_fi[["aihe"]] <- factor(tw_fi$topic, labels = topic_labels)
tw_fi <- tw_fi[!is.na(tw_fi$aihe),]
library(dplyr)
group_by(tw_fi, aihe) %>%
summarise(keskisuosio = round(mean(suosio)), mediaanisuosio = round(quantile(suosio, probs=0.5)))
aihe | keskisuosio | mediaanisuosio |
---|---|---|
Kannanottoja | 50 | 34 |
SuomiNousuun | 60 | 33 |
KookoomusTsemppi | 64 | 28 |
Kansanviestit | 49 | 24 |
# popularity by month
df_summary <- get_summary(tw_fi, "kuukausi")
q <- ggplot(df_summary,aes(aika,tweets, size=keski_suosio)) + geom_point()
q <- q + ylab("tweettejä")
q <- q + scale_x_discrete() + xlab("")
q <- q + theme(axis.text.x=element_text(size = 10,angle = -90, hjust = 0),
axis.text.y = element_text(size = 15),
axis.title.y = element_text(size=15),
legend.text = element_text(size = 15),
legend.title = element_text(size=10))
q
# help funtion for scale_y_log10
fmt <- function(){
function(x) format(x, nsmall=0L, scientific = FALSE)
}
q <-ggplot(tw_fi,aes(x=aihe, y = suosio)) +
geom_boxplot(outlier.size = 3)
q <- q + scale_y_log10(labels=fmt())
q + theme(text= element_text(size=15))
q <- ggplot(tw_fi,aes(topic,tunti,color=aihe, size=suosio)) +
geom_point(shape=19, position="jitter", alpha=0.5) + xlab("") +
scale_size(range = c(1, 10))
q <- q + theme(text = element_text(size =20),
legend.key.size=unit(1,"cm"),
legend.text = element_text(size = 10))
q + guides(colour=guide_legend(override.aes=list(size=5)))
tw_fi$vastaus <- !is.na(tw_fi$replyToSN)
my_lm <- lm(suosio~aihe+tunti+vastaus,data=tw_fi)
summary(my_lm)
Call:
lm(formula = suosio ~ aihe + tunti + vastaus, data = tw_fi)
Residuals:
Min 1Q Median 3Q Max
-79.54 -40.55 -15.51 13.49 604.04
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.0553 9.0340 3.991 7.29e-05 ***
aiheSuomiNousuun 1.3079 8.1524 0.160 0.8726
aiheKookoomusTsemppi 15.8333 8.1763 1.936 0.0532 .
aiheKansanviestit -7.2558 8.2132 -0.883 0.3773
tunti 2.2432 0.5508 4.073 5.20e-05 ***
vastausTRUE -61.4565 7.3406 -8.372 3.24e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 73.75 on 678 degrees of freedom
Multiple R-squared: 0.1151, Adjusted R-squared: 0.1086
F-statistic: 17.64 on 5 and 678 DF, p-value: < 2.2e-16
The former prime minister commented on the analysis using twitter.