# readtime/readtime.py

import re
import math

from pelican import signals
from html.parser import HTMLParser  #use html.parser for Python 3.6


# Assumed reading speed in words per minute; the word count of an article is
# divided by this value to obtain its reading time in minutes.
# Reference: http://en.wikipedia.org/wiki/Words_per_minute
WPM = 230.0


class MLStripper(HTMLParser):
    """HTML parser that ignores markup and collects the text between tags."""

    def __init__(self):
        # HTMLParser.__init__ already resets the parser state, so only the
        # text accumulator has to be set up here.
        super().__init__()
        self.fed = []

    def handle_data(self, d):
        """Record one piece of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return every recorded piece of text joined into a single string."""
        pieces = self.fed
        return ''.join(pieces)


def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html: HTML source as a string.

    Returns:
        The text pieces collected by ``MLStripper``, concatenated in
        document order.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing input that could still turn out to be an
    # unfinished tag or character reference (e.g. content ending in "R&D").
    # close() forces that buffered remainder to be processed so no text is
    # silently dropped.
    s.close()
    return s.get_data()


def calculate_readtime(content_object):
    """Estimate the reading time of *content_object* and store it on it.

    The HTML in ``content_object._content`` is stripped of its markup, the
    words are counted and the count is divided by ``WPM``. The result is
    rounded up, with a minimum of one minute, and stored as
    ``content_object.readtime = {"minutes": <int>}``.

    Nothing is set when ``_content`` is ``None``.

    Args:
        content_object: Object exposing the rendered HTML as ``_content``.
    """
    content = content_object._content
    if content is None:
        return

    text = strip_tags(content)

    # Count runs of letters/digits. Unlike re.split(), findall() never yields
    # the empty strings that appear at the edges of the text (or for empty
    # text), which used to inflate the count. ``[^\W_]`` is Unicode-aware, so
    # words with non-ASCII letters are counted as whole words, while ``_``
    # still acts as a separator.
    num_words = len(re.findall(r'[^\W_]+', text))

    # Round up and never report less than one minute.
    minutes = max(1, int(math.ceil(num_words / WPM)))

    content_object.readtime = {
        "minutes": minutes,
    }


def register():
    """Connect ``calculate_readtime`` to the ``content_object_init`` signal."""
    content_init = signals.content_object_init
    content_init.connect(calculate_readtime)