1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
|
import os
import re
import shutil

import requests
def regrex_demo():
    """Print every cnblogs PNG URL that the demo pattern finds in a sample.

    Ad-hoc check of the URL regex: takes no arguments and returns nothing.
    The sample text only contains kezunlin.me links, which the pattern does
    not match, so nothing is printed for it.
    """
    # Dots are escaped so they match a literal "." instead of any character.
    pattern = r'https://img201[7-9]\.cnblogs\.com/blog/\d{6}/20\d{4}/\d{6}-\d{,}-\d{,}\.png'
    string = """![flops](https://kezunlin.me/images/posts/635233-20190912095826925-710547982.png) hello world ![flops](https://kezunlin.me/images/posts/635233-20190912095826925-710547982.png)"""
    for url in re.findall(pattern, string):
        print(url)


def get_filepaths(root_dir):
    """Return the paths of all entries directly inside ``root_dir``.

    Args:
        root_dir: Directory to list (not recursive).

    Returns:
        A list with ``root_dir`` joined to each name from ``os.listdir``,
        in the order ``os.listdir`` yields them. Sub-directories are
        included alongside files.

    Raises:
        OSError: If ``root_dir`` cannot be listed.
    """
    # os.path.join avoids a doubled separator when root_dir already ends
    # with one, which a plain sep.join would produce.
    return [os.path.join(root_dir, filename) for filename in os.listdir(root_dir)]
def makesure_dir(dir):
    """Create directory ``dir`` (and any missing parents) if it is absent.

    Args:
        dir: Path of the directory to ensure. The name shadows the builtin
            but is kept so existing keyword callers keep working.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
    """
    # exist_ok=True removes the window between a separate exists() check
    # and the creation in which another process could create the directory.
    os.makedirs(dir, exist_ok=True)
def find_cnblog_image_urls(filepath):
    """Return every cnblogs-hosted JPG image URL found in a text file.

    Args:
        filepath: Path of the text file to scan.

    Returns:
        A list of the matched URL strings in order of appearance.
        Duplicates are kept; the list is empty when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Dots are escaped to match a literal "." and "https?" accepts exactly
    # http or https (the previous "https*" also accepted "httpsss").
    pattern = r'https?://images201[0-9]\.cnblogs\.com/blog/\d{6,8}/20\d{4}/\d{6,8}-\d{,}-\d{,}\.jpg'
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(filepath, "r", encoding="utf-8") as f:
        # Line breaks are removed before matching, so the text is searched
        # as one continuous string.
        contents = f.read().replace('\n', '')
    return re.findall(pattern, contents)
def download_image(url, to_file):
    """Download ``url`` and save the response body to ``to_file``.

    The file is only written, and the "Save to" message only printed, when
    the server answers with HTTP 200; any other status is ignored silently.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; opened in binary write mode.
    """
    # The context manager closes the streamed response, releasing the
    # connection even when the body is never read (non-200 answers).
    with requests.get(url, stream=True) as r:
        if r.status_code == 200:
            # r.raw is the undecoded byte stream; have it undo any
            # gzip/deflate transfer encoding so the saved file is the image
            # itself rather than compressed bytes.
            r.raw.decode_content = True
            with open(to_file, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            print("Save to ", to_file)
def download_image_by_chunk(url, to_file):
    """Download ``url`` to ``to_file``, writing the body in 1024-byte chunks.

    The file is only written, and the "Save to" message only printed, when
    the server answers with HTTP 200; any other status is ignored silently.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; opened in binary write mode.
    """
    # The context manager closes the streamed response, releasing the
    # connection even when the body is never read (non-200 answers).
    with requests.get(url, stream=True) as r:
        if r.status_code == 200:
            with open(to_file, 'wb') as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
            print("Save to ", to_file)
def replace_inplace(filepath, old_string, new_string):
    """Replace every occurrence of ``old_string`` in a file, in place.

    The whole file is read into memory, substituted, and written back to
    the same path.

    Args:
        filepath: Path of the text file to rewrite.
        old_string: Literal text to search for (not a regex).
        new_string: Text that replaces each occurrence.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "with" closes the handle even if read/write raises; the explicit
    # encoding keeps the round trip independent of the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        filedata = f.read()

    newdata = filedata.replace(old_string, new_string)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(newdata)
def download_image_wrapper(url, to_dir):
    """Download the image at ``url`` into ``to_dir`` and return its new URL.

    The local file is named after the last "/"-separated segment of
    ``url``. The returned URL is that same name under
    https://kezunlin.me/images/posts/ and is returned whatever the outcome
    of the download call.
    """
    new_url_template = "https://kezunlin.me/images/posts/{}"
    # Everything after the final slash is the image's file name.
    filename = url.rsplit("/", 1)[-1]
    download_image(url, os.path.sep.join((to_dir, filename)))
    return new_url_template.format(filename)
def process_all_posts():
    """Migrate cnblogs images referenced by every post under ``_posts``.

    For each file in ``_posts`` the cnblogs image URLs are collected, each
    image is downloaded into ``images/posts``, and the URL is rewritten in
    the post to the value returned by ``download_image_wrapper``. Both
    directories are relative to the current working directory.
    """
    to_dir = "images/posts"
    makesure_dir(to_dir)

    # The earlier dead assignment to "test_posts" was immediately
    # overwritten and has been dropped.
    posts_dir = "_posts"

    for filepath in get_filepaths(posts_dir):
        # get_filepaths lists every directory entry; sub-directories cannot
        # be opened as posts, so skip anything that is not a regular file.
        if not os.path.isfile(filepath):
            continue

        print("=" * 20)
        print(filepath)

        # dict.fromkeys drops repeated URLs while keeping first-seen order:
        # one replace already rewrites every occurrence, so fetching the
        # same image again would be wasted work.
        for url in dict.fromkeys(find_cnblog_image_urls(filepath)):
            new_url = download_image_wrapper(url, to_dir)
            replace_inplace(filepath, url, new_url)
def main():
    """Script entry point: process every post."""
    process_all_posts()


# Run the migration only when executed as a script, not on import.
if __name__ == "__main__":
    main()
""" grep -r "cnblogs.com/blog" source/_posts """
|