Simplified logic, changed http to https everywhere, skip files already downloaded

This commit is contained in:
Manish 2020-05-04 10:32:16 +10:00
parent d6cdfe3c72
commit 1802fbbdeb

View File

@ -12,40 +12,42 @@
############################################# #############################################
import PyPDF2 import PyPDF2
import urllib3 import urllib3
import os
import wget import wget
def download_book_from_page(page_url):
    """Download the book PDF linked from a Springer book page.

    Fetches ``page_url``, scrapes the book title (used as the output
    filename) and the PDF download link out of the raw HTML, then saves
    the file with wget.  Books whose target file already exists locally
    are skipped.

    NOTE(review): the scraping relies on exact ``split()`` patterns in
    the Springer page markup — brittle, but preserved as-is.
    """
    http = urllib3.PoolManager()
    res = http.request('GET', page_url)
    # Decode the response body once instead of re-decoding it per field.
    html = res.data.decode('utf-8')
    title = ''.join(html.split('h1')[1].split('>')[1].split('<')[0].split('/')[0]) + ".pdf"
    # Skip books already downloaded.
    if os.path.isfile(title):
        return
    download_url = ("https://link.springer.com/content/"
                    + html.split('Download book PDF')[0]
                          .split('content/')[1]
                          .split('title')[0]
                          .split('.pdf')[0]
                    + ".pdf")
    wget.download(download_url, title)
def process_books_in_pdf(pdf):
    """Scan every page of *pdf* for http:// links and download each book.

    Args:
        pdf: an open PyPDF2.PdfFileReader whose pages contain Springer
             book URLs, one per extracted text line.
             (assumes one URL per line — TODO confirm against Spring.pdf)
    """
    for page_index in range(pdf.numPages):
        page_lines = pdf.getPage(page_index).extractText().split('\n')
        # Original code shadowed the page loop variable `i` with the
        # line index; iterate the lines directly instead.
        for line in page_lines:
            if line.startswith("http://"):
                # Upgrade the scraped protocol from http to https.
                url = "https://" + line[len("http://"):]
                print(url)
                download_book_from_page(url)
def main():
    """Open Spring.pdf and download every book linked inside it."""
    # `with` fixes the original's leaked file handle (open() with no close()).
    with open('Spring.pdf', 'rb') as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        process_books_in_pdf(pdf)


# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()