The Le Monde archives: the archive pages are indexed by date, so we first generate one archive URL per day.
from collections import defaultdict
from urllib.request import urlopen
from urllib.error import HTTPError
import re

from bs4 import BeautifulSoup


def getArchiveLinks(daystart, dayend, monthstart, monthend):
    # Build one "DD-MM-2019" string per (day, month) pair; both end bounds are exclusive.
    dates = [str(day).zfill(2) + "-" + str(month).zfill(2) + "-2019"
             for day in range(daystart, dayend)
             for month in range(monthstart, monthend)]
    archive_links = [
        "https://www.lemonde.fr/archives-du-monde/" + date + "/" for date in dates]
    return archive_links
archive_links = getArchiveLinks(1, 29, 1, 9)
print(archive_links[:5])
## ['https://www.lemonde.fr/archives-du-monde/01-01-2019/', 'https://www.lemonde.fr/archives-du-monde/01-02-2019/', 'https://www.lemonde.fr/archives-du-monde/01-03-2019/', 'https://www.lemonde.fr/archives-du-monde/01-04-2019/', 'https://www.lemonde.fr/archives-du-monde/01-05-2019/']
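Note that getArchiveLinks iterates days and months independently, so with wider bounds it would emit dates that do not exist (e.g. 31-02-2019). A variant sketch using the standard datetime module to walk real calendar days instead (same URL pattern assumed; the exact end date here is illustrative):

from datetime import date, timedelta

def getArchiveLinksByDate(start, end):
    # Walk actual calendar days from start to end inclusive,
    # formatting each as DD-MM-YYYY as in the archive URLs above.
    links = []
    day = start
    while day <= end:
        links.append("https://www.lemonde.fr/archives-du-monde/"
                     + day.strftime("%d-%m-%Y") + "/")
        day += timedelta(days=1)
    return links

archive_links = getArchiveLinksByDate(date(2019, 1, 1), date(2019, 8, 28))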
Each archive page is then parsed with Beautiful Soup to collect the links of freely accessible articles:

def getArticlesLinks(archive_links):
    links_non_abonne = []
    for link in archive_links:
        try:
            html = urlopen(link)
        except HTTPError:
            print("url not valid", link)
            continue  # skip archive pages that cannot be fetched
        soup = BeautifulSoup(html, "html.parser")
        teasers = soup.find_all(class_="teaser")
        for item in teasers:
            # keep only free articles: subscriber-only teasers carry a 'sr-only' span
            if not item.find('span', {'class': 'sr-only'}):
                links_non_abonne.append(item.find('a')['href'])
    return links_non_abonne

Each article sits in a "teaser" block, and subscriber-only articles are flagged by a span with the "sr-only" class, so those are skipped. The collected links are saved to a file (see the sketch below), then classified by theme based on the URL path.
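The original excerpt does not show how lemonde/lemondeLinks.txt is produced; a minimal sketch, assuming one link per line:

links_non_abonne = getArticlesLinks(archive_links)
with open('lemonde/lemondeLinks.txt', 'w') as f:
    f.write('\n'.join(links_non_abonne))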
def classifyLinks(themeList, links):
    dict_links = defaultdict(list)
    for theme in themeList:
        # match every link whose path starts with the theme segment
        p = re.compile('https://www.lemonde.fr/' + theme + '.*')
        theme_links = p.findall(links)
        for link in theme_links:
            if 'en-direct' not in link:  # skip live-coverage pages
                dict_links[theme].append(link)
    return dict_links
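readFile is not defined in the excerpt either; a minimal sketch, assuming it returns the whole file as one string (which is what re.findall above expects):

def readFile(path):
    # Return the entire file contents as a single string.
    with open(path) as f:
        return f.read()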
links = readFile('lemonde/lemondeLinks.txt')
themes = ['culture', 'sport', 'economie']
dict_links = classifyLinks(themes, links)
for key, value in dict_links.items():
    print(key, len(value))
# culture 426
# sport 403
# economie 334

Finally, for each article page we extract the title (the h1 tag) and the body (the h2 and p tags inside the article tag):

def getSinglePage(url):
    try:
        html = urlopen(url)
    except HTTPError:
        print("url not valid", url)
        return None  # signal that the page could not be fetched
    soup = BeautifulSoup(html, "html.parser")
    # optional: dump the prettified HTML for inspection (overwritten on each call)
    with open('html.html', 'w') as f:
        f.write(soup.prettify())
    text_title = soup.find('h1')
    text_body = soup.article.find_all(["p", "h2"], recursive=False)
    return (text_title, text_body)
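getSinglePage returns Beautiful Soup tags rather than plain text; a usage sketch that builds a per-theme text corpus, assuming we simply concatenate the text of each element (the corpus dict and loop are illustrative, not from the original):

corpus = defaultdict(list)
for theme, theme_links in dict_links.items():
    for url in theme_links:
        result = getSinglePage(url)
        if result is None:
            continue  # page could not be fetched
        title, body = result
        # join the title and every paragraph/subheading into one plain-text string
        text = '\n'.join(el.get_text(strip=True) for el in [title] + body if el)
        corpus[theme].append(text)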