Simplified logic, changed http to https everywhere, skipped files already downloaded
This commit is contained in:
		
							parent
							
								
									d6cdfe3c72
								
							
						
					
					
						commit
						1802fbbdeb
					
				@ -12,40 +12,42 @@
 | 
			
		||||
#############################################
 | 
			
		||||
import PyPDF2
 | 
			
		||||
import urllib3
 | 
			
		||||
import os
 | 
			
		||||
import wget
 | 
			
		||||
 | 
			
		||||
def download(part_page_url):
 | 
			
		||||
    http =urllib3.PoolManager()
 | 
			
		||||
 | 
			
		||||
    page_url="https"+part_url
 | 
			
		||||
def download_book_from_page(page_url):
 | 
			
		||||
    http = urllib3.PoolManager()
 | 
			
		||||
 | 
			
		||||
    res =http.request('GET',page_url)    
 | 
			
		||||
    res = http.request('GET', page_url)
 | 
			
		||||
 | 
			
		||||
    title=''.join(res.data.decode('utf-8').split('h1')[1].split('>')[1].split('<')[0].split('/')[0])+".pdf"
 | 
			
		||||
    title = ''.join(res.data.decode('utf-8').split('h1')[1].split('>')[1].split('<')[0].split('/')[0])+".pdf"
 | 
			
		||||
    
 | 
			
		||||
    # skip books already downloaded
 | 
			
		||||
    if os.path.isfile(title):
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    dl_url="https://link.springer.com/content/"+res.data.decode('utf-8').split('Download book PDF')[0].split('content/')[1].split('title')[0].split('.pdf')[0]+".pdf"
 | 
			
		||||
    download_url = "https://link.springer.com/content/"+res.data.decode('utf-8').split('Download book PDF')[0].split('content/')[1].split('title')[0].split('.pdf')[0]+".pdf"
 | 
			
		||||
 | 
			
		||||
    wget.download(dl_url,title)
 | 
			
		||||
    wget.download(download_url, title)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
file =open('Spring.pdf','rb')
 | 
			
		||||
def process_books_in_pdf(pdf):
 | 
			
		||||
    for i in range(0, pdf.numPages):
 | 
			
		||||
        lines = pdf.getPage(i).extractText().split('\n')
 | 
			
		||||
 | 
			
		||||
f= PyPDF2.PdfFileReader(file)
 | 
			
		||||
        for i in range(len(lines)):
 | 
			
		||||
            if lines[i].startswith("http://"):
 | 
			
		||||
                # changing protocol from http to https
 | 
			
		||||
                url = "https://"+lines[i][7:]
 | 
			
		||||
                print(url)
 | 
			
		||||
                download_book_from_page(url)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
for i in range(0,f.numPages):
 | 
			
		||||
def main():
 | 
			
		||||
    file = open('Spring.pdf', 'rb')
 | 
			
		||||
    pdf = PyPDF2.PdfFileReader(file)
 | 
			
		||||
    process_books_in_pdf(pdf)
 | 
			
		||||
 | 
			
		||||
    if i ==0:
 | 
			
		||||
 | 
			
		||||
        for j in range (0, len(f.getPage(i).extractText().split('OpenURL')[1].split('ht'))):
 | 
			
		||||
            if f.getPage(i).extractText().split('OpenURL')[1].split('ht')[j].split('\n')[0] != '':
 | 
			
		||||
                print(f.getPage(i).extractText().split('OpenURL')[1].split('ht')[j].split('\n')[0])
 | 
			
		||||
                foo(f.getPage(i).extractText().split('OpenURL')[1].split('ht')[j].split('\n')[0])
 | 
			
		||||
 | 
			
		||||
    else:
 | 
			
		||||
        for j in range (0, len(f.getPage(i).extractText().split('ht'))):
 | 
			
		||||
            if f.getPage(i).extractText().split('ht')[j].split('\n')[0] !='':
 | 
			
		||||
                if len(f.getPage(i).extractText().split('ht')[j].split('\n')[0])==64:
 | 
			
		||||
                    print(f.getPage(i).extractText().split('ht')[j].split('\n')[0])
 | 
			
		||||
                    foo(f.getPage(i).extractText().split('ht')[j].split('\n')[0])
 | 
			
		||||
 | 
			
		||||
main()
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user