Topic Modelling on Fact check Articles

This notebook builds LDA topic models on the headlines of 13,000+ fact-checking stories in the Tattle archive.

Why headlines?

Because the body text of fact-checking stories often contains text which is not related to the misinformation being fact-checked. The headlines, however, are focused on the misinformation.

Brief description of the model

Latent Dirichlet Allocation is an unsupervised, probabilistic model that generates topic-document and word-topic probability distributions from a collection of text documents. Topics are themes that occur in documents. Here, document = story headline.

Process

  1. Getting the data from MongoDB and storing locally
  2. Text cleaning (removing noise, English / non-English separation using regex)
  3. Translating non-English headlines and storing translations locally
  4. Pre-processing all the headlines (tokenizing, stop word removal, lemmatizing, creating bigrams)
  5. Text transformation: creating a corpus of vectors
  6. Selecting the number of topics
  7. Building the topic model
  8. Topic interpretation
  9. Interactive model visualisation
  10. N-gram analysis of the headlines
  11. Possible applications

1. Getting the data

Importing libraries

import os

import requests
import time
from time import sleep
from random import uniform
import datetime
from datetime import date
import csv
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
import os
from os import environ
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel
import re
from langdetect import detect
from gensim.models.phrases import Phrases, Phraser
from nltk import FreqDist
from nltk.corpus import RegexpTokenizer as regextoken
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import googletrans
from googletrans import Translator
import pyLDAvis
import pyLDAvis.gensim
from gensim import similarities

Get data from factchecking sites MongoDB

def initialize_mongo():
    mongo_url = "mongodb+srv://os.environ.get("FACTCHECK_DB_USERNAME"):os.environ.get("FACTCHECK_DB_PASSWORD")@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&ssl_cert_reqs=CERT_NONE"
    cli = MongoClient(mongo_url)
    db = cli[os.environ.get("FACTCHECK_DB_NAME")]
    coll = db[os.environ.get("FACTCHECK_DB_COLLECTION")]
    if coll.count_documents({}) > 0:
        return coll
    else:
        print("Error accessing Mongo collection")
        sys.exit()

coll = initialize_mongo()
df = pd.DataFrame.from_records(coll.find({}))
df.to_csv("factchecking_stories.csv", index=False)
df = pd.read_csv("factchecking_stories.csv")
df = df.drop_duplicates()
# Snapshot of headlines
df["headline"][0:3]
0    Madhu Kishwar tweets photoshopped image of Amu...
1    Photoshopped promo of ABP News over Chandrayaa...
2    Video of scuffle between men in khaki uniform ...
Name: headline, dtype: object

2. Text cleaning

# Save headlines in a variable
raw_data = df["headline"].values.tolist()

# Defining a function to remove accented characters in the headlines
def data_dict(sentences):
    return dict((sentence, ", ".join(simple_preprocess(str(sentence), deacc=True, max_len=100))) for sentence in sentences)

result = data_dict(raw_data)

# Separating non-English headlines using regex
pat = re.compile("[^\x00-\x7F]") # matches non-English characters
non_eng = [k for k,v in result.items() if pat.search(v)]
eng = [k for k,v in result.items() if not pat.search(v)]

print(len(eng), len(non_eng))
8298 5241

3. Translating non-English headlines

Googletrans is a free library that sends translation requests to the Google Translate API. Random time delays between requests are advised, else Google may (and probably will) block your ip address.

Translating the 5200+ non-English headlines took approximately 6.5 hours.

# Translating non-English headlines using googletrans library

translator = Translator()
translations = []
for doc in non_eng:
    translations.append(translator.translate(doc).text)
    time.sleep(uniform(3,5))
# Saving the original and translated headlines for future reference
translated_headlines = dict(zip(non_eng, translations))
translations_df = pd.DataFrame(translated_headlines.items(), columns = ["headline", "translation"])
translations_df["original_english"] = 0
translations_df = translations_df.append(pd.DataFrame(eng, columns=['headline']), ignore_index=True, sort=True)
translations_df["original_english"].fillna(value=1, inplace = True)
translations_df["original_english"] = translations_df["original_english"].astype(int)
translations_df.to_csv("headlines_with_translations.csv")

4. Text preprocessing

# Combining the headlines
all_headlines = eng + translations
# Tokenizing the headlines
all_tokens = list(sent_to_words(all_headlines))
# Creating stop words list
stop_words = stopwords.words("english")
# Adding domain words
stop_words.extend(["fake", "fact", "check", "checked", "factcheck", "news", "false",
                   "falsely", "true", "truth", "viral", "video", "image", "picture",
                   "photo", "claim", "claiming", "share", "clip", "misleading"])
# Stop word removal
data_stopped = [[word for word in doc if word not in stop_words] for doc in all_tokens]
# Creating bigrams
bigram = gensim.models.Phrases(data_stopped, min_count=10)
for idx in range(len(data_stopped)):
    for token in bigram[data_stopped[idx]]:
        if '_' in token:
            # If token is bigram, add it to document
            data_stopped[idx].append(token)

data_with_bigrams = data_stopped
# Lemmatizing i.e. reducing words to their root form
# Including only nouns as this improves both topic interpretability and coherence scores
def lemmatization(docs, allowed_postags=["NOUN", "PROPN"]):
    nlp = spacy.load("en_core_web_sm")
    docs_out = []
    for sent in docs:
        doc = nlp(" ".join(sent))
        docs_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return docs_out

data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN"])

# Removing any stopwords created because of lemmatization
data_cleaned = [[word for word in doc if word not in stop_words] for doc in data_lemmatized]

5. Text transformation: creating a corpus of vectors

Topic modelling with the Gensim library involves documents, corpus, vectors and bag of words. These are explained here - https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

# Creating a dictionary
id2word = corpora.Dictionary(data_cleaned)
# Creating a document-term matrix
print('Number of unique tokens: %d' % len(id2word))
id2word.filter_extremes(no_below = 20)
# Creating a document-term matrix
corpus = [id2word.doc2bow(doc) for doc in data_cleaned]
Number of unique tokens: 7585

6. Selecting the number of topics

Topic coherence is an evaluation metric for topic models that measures the degree of semantic similarity between high scoring words in the topic. The graph below helps find an appropriate 'k' number of topics to model.

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=5, per_word_topics = True, chunksize=100, alpha='auto',
    eta='auto', eval_every=1, random_state = 0)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_cleaned, start=2, limit=20, step=1)
# Show graph
limit=20; start=2; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()
2020-05-06 16:54:18,831 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
2020-05-06 16:54:18,917 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
2020-05-06 16:54:18,989 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
2020-05-06 16:54:19,051 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
2020-05-06 16:54:41,787 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
2020-05-06 16:54:41,886 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
2020-05-06 16:55:03,933 [20296] WARNING  gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive

png

7. Building the topic model

We choose k=7 as k=11 produced some topics which were not as meaningful.

lda_model = ldamodel.LdaModel(corpus = corpus, num_topics = 7, id2word = id2word, chunksize=100, alpha='auto',
    eta='auto', passes = 5, random_state = 0)

# Printing the topic-word probabilities
pprint(lda_model.print_topics())
[(0,
  '0.094*"bjp" + 0.057*"people" + 0.057*"medium" + 0.044*"modi" + '
  '0.038*"social_media" + 0.036*"temple" + 0.032*"verification" + '
  '0.030*"state" + 0.027*"stock" + 0.026*"statement"'),
 (1,
  '0.123*"virus" + 0.066*"box" + 0.064*"corona_virus" + 0.056*"year" + '
  '0.041*"coronavirus" + 0.040*"student" + 0.039*"coro" + 0.035*"china" + '
  '0.032*"message" + 0.031*"girl"'),
 (2,
  '0.101*"india" + 0.047*"post" + 0.041*"muslim" + 0.041*"country" + '
  '0.041*"lockdown" + 0.036*"rally" + 0.033*"riot" + 0.030*"president" + '
  '0.027*"house" + 0.025*"shah"'),
 (3,
  '0.140*"police" + 0.089*"woman" + 0.081*"name" + 0.049*"man" + 0.049*"child" '
  '+ 0.038*"leader" + 0.028*"worker" + 0.027*"rumor" + 0.026*"khan" + '
  '0.025*"death"'),
 (4,
  '0.090*"gandhi" + 0.069*"minister" + 0.041*"election" + 0.041*"event" + '
  '0.040*"rahul_gandhi" + 0.035*"prime_minister" + 0.034*"color" + '
  '0.032*"body" + 0.028*"vote" + 0.026*"communal_color"'),
 (5,
  '0.058*"rape" + 0.051*"time" + 0.037*"army" + 0.036*"evidence" + '
  '0.036*"food" + 0.030*"mock" + 0.030*"mock_drill" + 0.028*"drill" + '
  '0.028*"day" + 0.026*"footage"'),
 (6,
  '0.091*"delhi" + 0.081*"caa" + 0.055*"protest" + 0.038*"government" + '
  '0.038*"person" + 0.035*"attack" + 0.032*"congress" + 0.030*"kashmir" + '
  '0.030*"money" + 0.028*"pradesh"')]

8. Topic interpretation

The above topic-word probability distributions can be interpreted as 7 topics or themes that describe the story headlines:

  • BJP, Modi & the media
  • Coronavirus
  • General national politics
  • Identity / people
  • Rahul Gandhi, Congress & national politics
  • Law enforcement & crime
  • Citizenship act protests & different states

9. Interactive model visualisation

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model3, corpus, id2word, sort_topics=True, n_jobs=1)
vis
2020-05-06 17:18:32,332 [20296] WARNING  py.warnings:110: [JupyterRequire] /Users/kruttikanadig/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))

10. N-gram analysis of headlines

# Redoing text processing with adjectives, verbs and adverbs included
def lemmatization(docs, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"]):
    nlp = spacy.load("en_core_web_sm")
    docs_out = []
    for sent in docs:
        doc = nlp(" ".join(sent))
        docs_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return docs_out

data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"])

# Removing any stopwords created because of lemmatization
data_cleaned = [[word for word in doc if word not in stop_words] for doc in data_lemmatized]
# Plot frequency distribution of most common words in the headlines
all_sents = [" ".join(doc) for doc in data_cleaned]
tokenizer = regextoken("[a-zA-Z0-9]+")
all_words = tokenizer.tokenize(str(all_sents))
fd = FreqDist()
for word in all_words:
    fd[word] += 1

# Plotting the top 50 most frequent words
# plt.figure(figsize = (15, 10))
# fd.plot(50)
# plt.show()
fd_sorted={k: v for k, v in sorted(fd.items(), key=lambda item: item[1], reverse=True)}
fd_sorted=dict(list(fd_sorted.items())[0: 100])
plt.figure(figsize = (15, 10))
plt.suptitle("Top words in 13,539 story headlines on Indian factchecking sites (as of May 2020)", fontsize= 15)
plt.title("source: Tattle archive", fontdict = {'fontsize': 12})
plt.xlabel(xlabel = "Word", fontdict = {'fontsize': 13})
plt.ylabel(ylabel = "Count", fontdict = {'fontsize': 13})
plt.xticks(rotation=90, size =11)
plt.yticks(np.arange(0, 2000, 100))
plt.plot(fd_sorted.keys(), fd_sorted.values())
plt.grid()
plt.tight_layout(pad=4)
plt.savefig("word_plot.png", bbox_inches='tight')
plt.show()

png

# # Plot frequency distribution of most common word-pairs in the headlines
bigrams_list=[]
for idx in range(len(data_cleaned)):
    for token in bigram[data_cleaned[idx]]:
        if '_' in token:
            # If token is bigram, add it to document
            bigrams_list.append(token)

fd = FreqDist()
for b in bigrams_list:
    fd[b] += 1

fd_sorted={k: v for k, v in sorted(fd.items(), key=lambda item: item[1], reverse=True)}
fd_sorted=dict(list(fd_sorted.items())[0: 100])
plt.figure(figsize = (15, 10))
plt.suptitle("Top bigrams in 13,539 story headlines on Indian factchecking sites (as of May 2020)", fontsize= 15)
plt.title("source: Tattle archive", fontdict = {'fontsize': 12})
plt.xlabel(xlabel = "Bigram", fontdict = {'fontsize': 13})
plt.ylabel(ylabel = "Count", fontdict = {'fontsize': 13})
plt.xticks(rotation=90, size =9)
plt.yticks(np.arange(0, 800, 50))
plt.plot(fd_sorted.keys(), fd_sorted.values())
plt.grid()
plt.tight_layout(pad=4)
plt.savefig("bigram_plot.png", bbox_inches='tight')
plt.show()

png

11. Possible applications

We have performed topic modelling and ngram analysis on 13,000+ fact-checking story headlines, including headlines that were originally in Indian languages. Some possible applications of this analysis are:

  • Content selection: Targeting content categories aligned with the produced topics / words while running social media scrapers
  • Content filtering: Using top words from topic model and ngram analysis along with image text-extraction tool to triage new content for the archive
  • Content enrichment: Offering some of these topics as tags / labels in the annotation UI
  • Communications around Tattle's archive
Text and illustrations on the website is licensed under Creative Commons 4.0 License. The code is licensed under GPL. For data, please look at respective licenses.