#############################################
###
### Download Springer Books
### Corona Virus Time
###
### carlos@cardenas.pe
###
### GPL 3.0 v
###
### 27/04/2020
#############################################
"""Download the Springer books whose landing pages are listed in a PDF.

The script reads ``Spring.pdf`` from the current directory, looks for text
lines that start with ``http://``, and for each one fetches the book's
landing page and downloads the linked PDF into the current directory.
"""

import os

import PyPDF2
import urllib3
import wget


def download_book_from_page(page_url):
    """Download the book PDF linked from the landing page at *page_url*.

    The output file name is scraped from the page: the text between the
    first ``>`` and the following ``<`` after the first occurrence of
    ``h1``, truncated at the first ``/``, with ``.pdf`` appended. If a file
    with that name already exists, nothing is downloaded.

    Raises:
        IndexError: if the page does not contain the markers the scraping
            relies on (``h1``, ``content/``).
        UnicodeDecodeError: if the response body is not valid UTF-8.
        Any exception raised by ``urllib3`` or ``wget`` is propagated.
    """
    http = urllib3.PoolManager()
    res = http.request('GET', page_url)
    # Decode once; both the title and the download link are scraped from it.
    html = res.data.decode('utf-8')

    # Cutting at '/' keeps the title from being read as a nested path.
    heading = html.split('h1')[1].split('>')[1].split('<')[0]
    title = heading.split('/')[0] + ".pdf"

    # skip books already downloaded
    if os.path.isfile(title):
        return

    # The link is the last-but-nothing 'content/...' fragment that appears
    # before the "Download book PDF" label; strip any trailing attributes.
    before_label = html.split('Download book PDF')[0]
    content_path = before_label.split('content/')[1]
    content_path = content_path.split('title')[0].split('.pdf')[0]
    download_url = "https://link.springer.com/content/" + content_path + ".pdf"

    wget.download(download_url, title)


def process_books_in_pdf(pdf, max_attempts=3):
    """Download every book whose ``http://`` link appears in *pdf*.

    Args:
        pdf: an object exposing ``numPages`` and ``getPage(i).extractText()``
            (the legacy ``PyPDF2.PdfFileReader`` interface).
        max_attempts: how many times to try each book before skipping it.
            A value below 1 means no download is attempted.

    A failing download is retried up to *max_attempts* times and then
    skipped, so one broken link cannot stall the whole run.
    """
    for page_number in range(pdf.numPages):
        page_text = pdf.getPage(page_number).extractText()
        for raw_line in page_text.split('\n'):
            line = raw_line.strip()
            if not line.startswith("http://"):
                continue

            # changing protocol from http to https
            url = "https://" + line[len("http://"):]
            print(url)

            for attempt in range(1, max_attempts + 1):
                try:
                    download_book_from_page(url)
                except Exception as error:  # Ctrl-C still propagates
                    if attempt < max_attempts:
                        print("Error while downloading, trying again.")
                    else:
                        print("Giving up on %s after %d attempts: %s"
                              % (url, max_attempts, error))
                else:
                    break


def main():
    """Open ``Spring.pdf`` and download every book it links to."""
    # The reader pulls pages lazily, so the file must stay open while the
    # PDF is being processed.
    with open('Spring.pdf', 'rb') as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        process_books_in_pdf(pdf)


if __name__ == "__main__":
    main()