diff --git a/parsers/__init__.py b/parsers/__init__.py
index c19aafa9..52bed7f2 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -12,6 +12,7 @@
 cnn.CNNParser
 politico.PoliticoParser
 bbc.BBCParser
+telegraaf.TelegraafParser
 """.split()
 
 parser_dict = {}
diff --git a/parsers/telegraaf.py b/parsers/telegraaf.py
new file mode 100644
index 00000000..a5c51c6c
--- /dev/null
+++ b/parsers/telegraaf.py
@@ -0,0 +1,41 @@
+from baseparser import BaseParser
+from BeautifulSoup import BeautifulSoup, Tag, Comment
+
+
+class TelegraafParser(BaseParser):
+    domains = ['www.telegraaf.nl']
+
+    feeder_base = 'http://www.telegraaf.nl'
+    feeder_pat = '^http://www.telegraaf.nl/\w+/\d+/'
+
+    def _parse(self, html):
+        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES,
+                             fromEncoding='utf-8')
+
+        self.meta = soup.findAll('meta')
+
+        article = soup.find('div', id='artikel')
+        title = article.find('h1')
+
+        self.title = ''
+        for i in title.childGenerator():
+            # Skip comments
+            if isinstance(i, Comment):
+                continue
+
+            self.title += i.lstrip()
+
+        self.byline = ''
+
+        date = article.find('div', 'artDatePostings')
+        self.date = date.find('span', 'datum').getText()
+
+        article_column = soup.find('div', id='artikelKolom')
+
+        self.body = ''
+        for i in article_column.childGenerator():
+            if not isinstance(i, Tag):
+                continue
+            if i.name != 'p':
+                continue
+            self.body += i.getText() + '\n\n'
diff --git a/website/frontend/models.py b/website/frontend/models.py
index 64170af0..ddbc617e 100644
--- a/website/frontend/models.py
+++ b/website/frontend/models.py
@@ -21,6 +21,7 @@ def strip_prefix(string, prefix):
     'edition.cnn.com': 'CNN',
     'www.bbc.co.uk': 'BBC',
     'www.politico.com': 'Politico',
+    'www.telegraaf.nl': 'Telegraaf',
 }
 
 ancient = datetime(1901, 1, 1)
diff --git a/website/frontend/views.py b/website/frontend/views.py
index 4b0a7487..577f8b20 100644
--- a/website/frontend/views.py
+++ b/website/frontend/views.py
@@ -95,7 +95,7 @@ def get_articles(source=None, distance=0):
     return articles
 
 
-SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk'.split() + ['']
+SOURCES = 'nytimes.com cnn.com politico.com bbc.co.uk telegraaf.nl'.split() + ['']
 
 @cache_page(60 * 30) #30 minute cache
 def browse(request, source=''):
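
Below the diff: a minimal standalone sketch of the extraction logic that TelegraafParser._parse applies, for anyone who wants to poke at it without wiring up BaseParser. It assumes Python 2 and BeautifulSoup 3, matching the imports in the new file. The HTML stub, its Dutch text, and the sample article URL are invented for illustration; only the feeder_pat regex and the ids/class names (artikel, artDatePostings, datum, artikelKolom) mirror the parser above.

# Standalone sketch (Python 2, BeautifulSoup 3) of the logic in
# TelegraafParser._parse. The markup below is a hand-written stand-in
# for a Telegraaf article page, not real site output.
import re
from BeautifulSoup import BeautifulSoup, Tag, Comment

# A URL shaped like the ones feeder_pat is meant to accept (path invented):
print bool(re.match('^http://www.telegraaf.nl/\w+/\d+/',
                    'http://www.telegraaf.nl/binnenland/11483049/'))  # True

html = '''
<div id="artikel">
  <h1>Voorbeeldkop<!-- tracking markup --></h1>
  <div class="artDatePostings"><span class="datum">wo 01 feb 2012, 12:00</span></div>
</div>
<div id="artikelKolom">
  <p>Eerste alinea.</p>
  <div class="gerelateerd">not a paragraph, skipped</div>
  <p>Tweede alinea.</p>
</div>
'''

soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
article = soup.find('div', id='artikel')

# Title: walk the <h1> children and drop embedded comments, as _parse does.
title = ''
for child in article.find('h1').childGenerator():
    if isinstance(child, Comment):
        continue
    title += child.lstrip()
print title                                    # Voorbeeldkop

# Date: a positional string attr in BeautifulSoup 3 matches on class.
print article.find('span', 'datum').getText()  # wo 01 feb 2012, 12:00

# Body: only direct <p> children of #artikelKolom contribute.
body = ''
for child in soup.find('div', id='artikelKolom').childGenerator():
    if isinstance(child, Tag) and child.name == 'p':
        body += child.getText() + '\n\n'
print body

Iterating childGenerator() rather than calling findAll('p') keeps the body to direct children of artikelKolom, so paragraphs nested inside related-article boxes or ad containers are not swept in.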