@ -12,40 +12,42 @@
#############################################
import PyPDF2
import urllib3
import os
import wget
def download(part_page_url):
    """Download the book whose landing-page URL is given without its scheme.

    Args:
        part_page_url: Remainder of the page URL that follows the scheme
            name; "https" is prepended to it to form the full URL.
            NOTE(review): presumably starts with "://" - confirm with callers.
    """
    # Build the URL from the parameter itself; the body previously read an
    # undefined name here and raised NameError on every call.
    page_url = "https" + part_page_url
    download_book_from_page(page_url)
def download_book_from_page(page_url):
    """Fetch a book's landing page and download its PDF into the working dir.

    The file name is scraped from the text of the first ``h1`` element on
    the page (cut at the first "/") with ".pdf" appended. The PDF address is
    rebuilt from the ``content/....pdf`` path that appears before the
    "Download book PDF" label. Nothing is downloaded when a file with that
    name already exists.

    Args:
        page_url: Absolute URL of the book's landing page.

    Raises:
        IndexError: If the page does not contain the expected markers.
    """
    http = urllib3.PoolManager()
    res = http.request('GET', page_url)
    # Decode the body once and reuse it for both scrapes.
    html = res.data.decode('utf-8')

    heading = html.split('h1')[1].split('>')[1].split('<')[0]
    # Keep only the text before the first "/" so the title is a plain file
    # name rather than a path.
    title = heading.split('/')[0] + ".pdf"

    before_label = html.split('Download book PDF')[0]
    pdf_path = before_label.split('content/')[1].split('title')[0].split('.pdf')[0]
    dl_url = "https://link.springer.com/content/" + pdf_path + ".pdf"

    # skip books already downloaded
    if os.path.isfile(title):
        return

    wget.download(dl_url, title)
def process_books_in_pdf(pdf):
    """Download every book whose page URL is listed in *pdf*.

    The extracted text of each page is scanned line by line. A line that
    starts with "http://" is taken as a book page URL, rewritten to https,
    printed, and handed to ``download_book_from_page``.

    Args:
        pdf: Reader object exposing ``numPages`` and
            ``getPage(i).extractText()``.
    """
    http_prefix = "http://"
    for page_no in range(pdf.numPages):
        page_text = pdf.getPage(page_no).extractText()
        for line in page_text.split('\n'):
            if not line.startswith(http_prefix):
                continue
            # changing protocol from http to https
            url = "https://" + line[len(http_prefix):]
            print(url)
            download_book_from_page(url)
def main():
    """Open 'Spring.pdf' and download every book it lists."""
    # The reader pulls from the file while pages are processed, so the
    # handle stays open for the whole run and is closed on exit.
    with open('Spring.pdf', 'rb') as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        process_books_in_pdf(pdf)
# Start the download run. The call is unguarded, so it also fires on import.
main ( )