How to find and replace strings with Python regex


Regex Online Demo

regex online demo

Code Example

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os 
import re 

import shutil
import requests

def regrex_demo():
    """Print every cnblogs-hosted PNG image URL found in a sample snippet.

    A small self-contained demonstration of ``re.findall``.  It takes no
    arguments, returns nothing, and prints one matched URL per line.
    """
    # The dots are escaped so that they match a literal "." only; an
    # unescaped "." matches any character.  "\d*" is the conventional
    # spelling of the "\d{,}" quantifier (zero or more digits).
    pattern = r'https://img201[7-9]\.cnblogs\.com/blog/\d{6}/20\d{4}/\d{6}-\d*-\d*\.png'
    # The sample must contain URLs of the shape the pattern targets,
    # otherwise findall returns an empty list and the demo shows nothing.
    string = """![flops](https://img2018.cnblogs.com/blog/635233/201909/635233-20190912095826925-710547982.png)
            hello world
            ![flops](https://img2018.cnblogs.com/blog/635233/201909/635233-20190912095826925-710547982.png)"""
    # The pattern has no capture groups, so findall yields whole-match strings.
    for url in re.findall(pattern, string):
        print(url)

def get_filepaths(root_dir):
    """Return the paths of all entries directly inside ``root_dir``.

    Each name reported by ``os.listdir`` is joined onto ``root_dir``.  The
    listing is not recursive and is not filtered, so sub-directories are
    returned alongside regular files.

    Args:
        root_dir: Directory to list.

    Returns:
        A list of paths, ordered by entry name so that repeated runs visit
        the entries in the same order (``os.listdir`` itself gives no
        ordering guarantee).

    Raises:
        OSError: If ``root_dir`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.path.join avoids the doubled separator that a plain
    # ``os.path.sep.join`` produces when root_dir already ends with one.
    return [os.path.join(root_dir, name) for name in sorted(os.listdir(root_dir))]

def makesure_dir(dir):
    """Create directory ``dir`` (and any missing parents) if it is absent.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir: Path of the directory to ensure.  The name shadows the builtin
            ``dir`` but is kept so that keyword callers continue to work.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
        OSError: For other creation failures, propagated from
            ``os.makedirs``.
    """
    # exist_ok=True turns "check, then create" into a single call, so there
    # is no window in which another process can create the directory
    # between the check and the creation.
    os.makedirs(dir, exist_ok=True)

# Matches image URLs hosted on cnblogs:
#   - scheme "http" or "https"
#   - host "img201N.cnblogs.com" or "images201N.cnblogs.com" (N = 0-9)
#   - path "/blog/<6-8 digits>/20<4 digits>/<6-8 digits>-<digits>-<digits>"
#   - extension ".png" or ".jpg"
# Dots are escaped so they only match a literal "."; the groups are
# non-capturing so that findall returns whole URLs rather than group text.
_CNBLOG_IMAGE_URL_RE = re.compile(
    r'https?://(?:img|images)201[0-9]\.cnblogs\.com/blog/'
    r'\d{6,8}/20\d{4}/\d{6,8}-\d*-\d*\.(?:png|jpg)'
)


def find_cnblog_image_urls(filepath):
    """Return every cnblogs image URL that occurs in the file at ``filepath``.

    Both host spellings (``img201N`` and ``images201N``) and both file
    extensions (``.png`` and ``.jpg``) are recognised in a single pass, so
    the pattern no longer has to be edited and the script re-run for each
    combination.

    Args:
        filepath: Path of the text file to scan; it is read as UTF-8.

    Returns:
        A list of the matched URLs in order of appearance.  A URL that
        occurs several times is listed several times.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        # The text is searched as-is.  The pattern contains no whitespace,
        # so stripping newlines first could only create matches that span a
        # line break, and such a match would not exist verbatim in the file.
        contents = f.read()
    return _CNBLOG_IMAGE_URL_RE.findall(contents)

def download_image(url, to_file, timeout=30):
    """Download ``url`` and save the response body to ``to_file``.

    The file is written only when the server answers with HTTP 200; any
    other status is reported on stdout and nothing is written.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; it is opened in binary write mode.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout the request can wait indefinitely on an
            unresponsive server.

    Raises:
        requests.exceptions.RequestException: On connection errors or
            timeouts, propagated from ``requests``.
        OSError: If ``to_file`` cannot be written.
    """
    # Using the response as a context manager releases the connection even
    # when writing the file fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print("Skip download, HTTP status", r.status_code, url)
            return
        # Have the raw stream undo any gzip/deflate Content-Encoding;
        # otherwise the still-compressed bytes would be saved as the image.
        r.raw.decode_content = True
        with open(to_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    print("Save to ",to_file)

def download_image_by_chunk(url, to_file, timeout=30):
    """Download ``url`` to ``to_file``, writing the body in 1024-byte chunks.

    The file is written only when the server answers with HTTP 200; any
    other status is reported on stdout and nothing is written.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; it is opened in binary write mode.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout the request can wait indefinitely on an
            unresponsive server.

    Raises:
        requests.exceptions.RequestException: On connection errors or
            timeouts, propagated from ``requests``.
        OSError: If ``to_file`` cannot be written.
    """
    # Using the response as a context manager releases the connection even
    # when writing the file fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print("Skip download, HTTP status", r.status_code, url)
            return
        with open(to_file, 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
    print("Save to ",to_file)

def replace_inplace(filepath, old_string, new_string):
    """Replace every occurrence of ``old_string`` in a file, in place.

    Args:
        filepath: Path of the text file to rewrite; it is read and written
            as UTF-8.
        old_string: Text to search for.
        new_string: Text that replaces each occurrence.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline='' disables newline translation in both directions, so the
    # file keeps its original line endings (CRLF stays CRLF).
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        filedata = f.read()

    newdata = filedata.replace(old_string, new_string)
    if newdata == filedata:
        # Nothing matched: leave the file (and its modification time) alone.
        return

    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        f.write(newdata)

def download_image_wrapper(url, to_dir, base_url="https://kezunlin.me/images/posts"):
    """Download the image at ``url`` into ``to_dir`` and return its new URL.

    The local file name is the last path segment of ``url``.

    Args:
        url: Address of the image to fetch.
        to_dir: Directory in which the image is stored.
        base_url: URL prefix under which the files in ``to_dir`` are
            published.  The default reproduces the previously hard-coded
            address.

    Returns:
        ``base_url`` joined with the file name when the image is present in
        ``to_dir`` after the download attempt; otherwise ``url`` unchanged,
        so that a caller substituting the returned value never points a
        post at an image that was not saved.
    """
    filename = url.split("/")[-1]
    to_file = os.path.join(to_dir, filename)
    download_image(url, to_file)

    # download_image writes nothing for a non-200 response, so check the
    # result on disk instead of assuming the download succeeded.
    if not os.path.isfile(to_file):
        return url

    return "{}/{}".format(base_url.rstrip("/"), filename)

def process_all_posts(posts_dir="_posts", to_dir="images/posts"):
    """Download the cnblogs images of every post and rewrite their URLs.

    For each file directly inside ``posts_dir`` the cnblogs image URLs are
    collected, each image is downloaded into ``to_dir``, and the URL is
    replaced in the post by the value ``download_image_wrapper`` returns.

    Args:
        posts_dir: Directory holding the posts (for example
            ``"../source/_posts/"``).  Defaults to ``"_posts"``, the value
            that was previously hard-coded.
        to_dir: Directory that receives the downloaded images; it is
            created when missing.
    """
    makesure_dir(to_dir)

    for filepath in get_filepaths(posts_dir):
        # get_filepaths also lists sub-directories, which cannot be opened
        # as posts.
        if not os.path.isfile(filepath):
            continue
        print("="*20)
        print(filepath)
        # dict.fromkeys drops repeated URLs while keeping first-seen order:
        # one replacement already rewrites every occurrence in the file, so
        # a repeat would only download the same image again.
        for url in dict.fromkeys(find_cnblog_image_urls(filepath)):
            new_url = download_image_wrapper(url, to_dir)
            replace_inplace(filepath, url, new_url)

def main():
    """Script entry point: run ``process_all_posts``."""
    # To try the regex demonstration instead, call regrex_demo() here.
    process_all_posts()

# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

"""
Shell command that lists the lines in source/_posts that still contain a
cnblogs image path:

grep -r "cnblogs.com/blog" source/_posts
"""

Reference

History

  • 2019/12/13: created.

Author: kezunlin
Reprint policy: Unless otherwise stated, all articles in this blog are licensed under the CC BY 4.0 reprint policy. If you reproduce an article, please credit the source: kezunlin.
评论
  TOC