From 1802fbbdebff97c09f27cd8e920d028c9c6ce345 Mon Sep 17 00:00:00 2001 From: Manish Date: Mon, 4 May 2020 10:32:16 +1000 Subject: [PATCH] Simplified logic, changed http to https everywhere, skip files already downloaded --- Springer-Libros.py | 48 ++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/Springer-Libros.py b/Springer-Libros.py index 588b510..34ab966 100644 --- a/Springer-Libros.py +++ b/Springer-Libros.py @@ -12,40 +12,42 @@ ############################################# import PyPDF2 import urllib3 +import os import wget -def download(part_page_url): - http =urllib3.PoolManager() - page_url="https"+part_url +def download_book_from_page(page_url): + http = urllib3.PoolManager() - res =http.request('GET',page_url) + res = http.request('GET', page_url) - title=''.join(res.data.decode('utf-8').split('h1')[1].split('>')[1].split('<')[0].split('/')[0])+".pdf" + title = ''.join(res.data.decode('utf-8').split('h1')[1].split('>')[1].split('<')[0].split('/')[0])+".pdf" + + # skip books already downloaded + if os.path.isfile(title): + return - dl_url="https://link.springer.com/content/"+res.data.decode('utf-8').split('Download book PDF')[0].split('content/')[1].split('title')[0].split('.pdf')[0]+".pdf" + download_url = "https://link.springer.com/content/"+res.data.decode('utf-8').split('Download book PDF')[0].split('content/')[1].split('title')[0].split('.pdf')[0]+".pdf" - wget.download(dl_url,title) + wget.download(download_url, title) -file =open('Spring.pdf','rb') +def process_books_in_pdf(pdf): + for i in range(0, pdf.numPages): + lines = pdf.getPage(i).extractText().split('\n') -f= PyPDF2.PdfFileReader(file) + for i in range(len(lines)): + if lines[i].startswith("http://"): + # changing protocol from http to https + url = "https://"+lines[i][7:] + print(url) + download_book_from_page(url) -for i in range(0,f.numPages): +def main(): + file = open('Spring.pdf', 'rb') + pdf = PyPDF2.PdfFileReader(file) + 
process_books_in_pdf(pdf) - if i ==0: - - for j in range (0, len(f.getPage(i).extractText().split('OpenURL')[1].split('ht'))): - if f.getPage(i).extractText().split('OpenURL')[1].split('ht')[j].split('\n')[0] != '': - print(f.getPage(i).extractText().split('OpenURL')[1].split('ht')[j].split('\n')[0]) - foo(f.getPage(i).extractText().split('OpenURL')[1].split('ht')[j].split('\n')[0]) - - else: - for j in range (0, len(f.getPage(i).extractText().split('ht'))): - if f.getPage(i).extractText().split('ht')[j].split('\n')[0] !='': - if len(f.getPage(i).extractText().split('ht')[j].split('\n')[0])==64: - print(f.getPage(i).extractText().split('ht')[j].split('\n')[0]) - foo(f.getPage(i).extractText().split('ht')[j].split('\n')[0]) +main()