Forked from https://github.com/unimauro/Springer202004Books. "As part of its response to COVID-19, Springer has made available
several text books for free download." This script automatically downloads those books.
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
1.5 KiB
60 lines
1.5 KiB
#############################################
###
### Download Springer Books
### Corona Virus Time
###
### [email protected]
###
### GPL v3.0
###
### 27/04/2020
###
#############################################
|
import PyPDF2 |
|
import urllib3 |
|
import os |
|
import wget |
|
|
|
|
|
def download_book_from_page(page_url):
    """Fetch a book's landing page and save the linked PDF locally.

    The local file name is derived from the page markup: the text that
    follows the first ``h1`` occurrence (between the next ``>`` and the
    following ``<``), cut at the first ``/``, with ``.pdf`` appended. If a
    file with that name already exists, nothing is downloaded.

    Args:
        page_url: URL of the book's landing page.

    Raises:
        IndexError: If the page markup lacks the markers this parser
            splits on (``h1``, ``>``, ``content/``).
    """
    pool = urllib3.PoolManager()
    response = pool.request('GET', page_url)
    html = response.data.decode('utf-8')

    # Heading text: the segment after the first 'h1', between '>' and '<'.
    after_h1 = html.split('h1')[1]
    heading = after_h1.split('>')[1].split('<')[0]
    # A '/' would be read as a directory separator, so keep only the
    # part of the heading that precedes it.
    title = heading.split('/')[0] + ".pdf"

    # skip books already downloaded
    if os.path.isfile(title):
        return

    # The PDF path is taken from the markup that precedes the
    # "Download book PDF" label: whatever follows 'content/' up to the
    # first 'title' / '.pdf' marker.
    before_label = html.split('Download book PDF')[0]
    pdf_path = before_label.split('content/')[1].split('title')[0].split('.pdf')[0]
    download_url = "https://link.springer.com/content/" + pdf_path + ".pdf"

    wget.download(download_url, title)
|
|
|
|
|
def process_books_in_pdf(pdf, max_retries=3, download=None):
    """Download every book whose landing-page URL is listed in *pdf*.

    The text of each page is split into lines; every line that starts with
    ``http://`` is rewritten to ``https://`` and passed to the downloader.
    A failing URL is retried a bounded number of times and then skipped, so
    one permanently broken link cannot stall the whole run.

    Args:
        pdf: Reader object exposing ``numPages`` and ``getPage(i)``, where
            each page provides ``extractText()``.
        max_retries: Number of extra attempts made for a URL after its
            first failure. Values below zero are treated as zero.
        download: Callable invoked with the https URL. Defaults to
            ``download_book_from_page``.

    Returns:
        None.
    """
    if download is None:
        download = download_book_from_page

    attempts_allowed = max(0, max_retries) + 1
    scheme = "http://"

    for page_number in range(pdf.numPages):
        lines = pdf.getPage(page_number).extractText().split('\n')

        for line in lines:
            if not line.startswith(scheme):
                continue

            # changing protocol from http to https
            url = "https://" + line[len(scheme):]
            print(url)

            for attempt in range(attempts_allowed):
                try:
                    download(url)
                except Exception:
                    # Only Exception is caught so that Ctrl-C
                    # (KeyboardInterrupt) and SystemExit still stop the run.
                    if attempt + 1 < attempts_allowed:
                        print("Error while downloading, trying again.")
                    else:
                        print("Error while downloading, giving up on " + url)
                else:
                    break
|
|
|
|
|
def main(pdf_path='Spring.pdf'):
    """Read the catalogue PDF and download every book it lists.

    Args:
        pdf_path: Path of the PDF that contains the book URLs. Defaults to
            ``Spring.pdf`` in the current working directory.
    """
    # The reader is handed the open stream, so all processing happens inside
    # the ``with`` block; the file is closed afterwards even if a download
    # raises.
    with open(pdf_path, 'rb') as catalogue:
        pdf = PyPDF2.PdfFileReader(catalogue)
        process_books_in_pdf(pdf)
|
|
|
|
|
# Script entry point: starts the download as soon as this file is executed.
# NOTE(review): the call is unguarded, so importing this module triggers it
# too; consider wrapping it in `if __name__ == "__main__":`.
main()
|
|
|