0%

How to find and replace strings with Python regex

Regex Online Demo

regex online demo

Code Example

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os
import re

import shutil
import requests

def regrex_demo():
    """Print every cnblogs-hosted PNG image URL found in a sample text.

    Small demonstration of ``re.findall`` with a cnblogs image-URL pattern.
    Each match is printed on its own line; nothing is returned.
    """
    # Literal dots are escaped so "." cannot match an arbitrary character,
    # and the numeric parts of the file name use "\d+" rather than "\d{,}"
    # (which is equivalent to "\d*" and therefore also accepts no digits).
    pattern = r'https://img201[7-9]\.cnblogs\.com/blog/\d{6}/20\d{4}/\d{6}-\d+-\d+\.png'
    # NOTE(review): the sample below only contains kezunlin.me URLs, so the
    # pattern finds nothing and the demo prints nothing. Presumably the sample
    # originally held cnblogs URLs -- confirm before relying on this demo.
    string = """![flops](https://kezunlin.me/images/posts/635233-20190912095826925-710547982.png)
hello world
![flops](https://kezunlin.me/images/posts/635233-20190912095826925-710547982.png)"""
    results = re.findall(pattern, string)  # list of matched URL strings
    for url in results:
        print(url)

def get_filepaths(root_dir):
    """Return the paths of all entries directly inside ``root_dir``.

    Args:
        root_dir: Directory to list (not recursive).

    Returns:
        A list of ``root_dir`` joined with each entry name, sorted by entry
        name so the processing order is reproducible. Sub-directories are
        included; callers that need regular files only must filter.

    Raises:
        OSError: If ``root_dir`` cannot be listed.
    """
    # os.path.join (instead of joining on os.path.sep by hand) avoids a
    # doubled separator when root_dir already ends with one.
    return [os.path.join(root_dir, name) for name in sorted(os.listdir(root_dir))]

def makesure_dir(dir):
    """Create directory ``dir`` (including missing parents) if needed.

    Args:
        dir: Path of the directory that must exist afterwards.

    Raises:
        OSError: If the directory cannot be created, e.g. because the path
            already exists as a regular file.
    """
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the directory in between.
    os.makedirs(dir, exist_ok=True)

def find_cnblog_image_urls(filepath):
    """Return all cnblogs-hosted image URLs found in a text file.

    Matches both host families (``img201x`` and ``images201x``), both
    ``http`` and ``https``, and both ``.png`` and ``.jpg`` images, so a
    single pass covers every variant instead of editing the pattern and
    re-running the script for each one.

    Args:
        filepath: Path of the UTF-8 text file (e.g. a markdown post) to scan.

    Returns:
        A list of matched URL strings in order of appearance; duplicates
        are kept. Empty if nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "https?" instead of "https*" (which also accepted "httpsss"), escaped
    # dots so "." only matches a literal dot, and "\d+" instead of "\d{,}"
    # (equivalent to "\d*", i.e. it also accepted an empty number).
    pattern = (
        r'https?://(?:img|images)201[0-9]\.cnblogs\.com/blog/'
        r'\d{6,8}/20\d{4}/\d{6,8}-\d+-\d+\.(?:png|jpg)'
    )

    with open(filepath, "r", encoding="utf-8") as f:
        # Newlines are deliberately kept: the pattern cannot span lines, and
        # gluing lines together could yield a "URL" that does not occur
        # verbatim in the file and so could never be replaced afterwards.
        contents = f.read()
    return re.findall(pattern, contents)

def download_image(url, to_file, timeout=30):
    """Download ``url`` and save the response body to ``to_file``.

    The body is streamed from the raw response into the file with
    ``shutil.copyfileobj`` rather than being loaded into memory first.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; overwritten if it already exists.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        True if the server answered 200 and the file was written, False
        otherwise (in which case no file is created).

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If ``to_file`` cannot be written.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print("Failed to download", url, "- HTTP status", r.status_code)
            return False
        # Have urllib3 undo gzip/deflate transfer encoding; without this the
        # raw stream would be stored still compressed.
        r.raw.decode_content = True
        with open(to_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    print("Save to ", to_file)
    return True

def download_image_by_chunk(url, to_file, timeout=30):
    """Download ``url`` to ``to_file``, writing the body in 1024-byte chunks.

    Alternative to ``download_image`` that iterates over the response with
    ``iter_content`` instead of copying from the raw stream.

    Args:
        url: Address of the image to fetch.
        to_file: Destination path; overwritten if it already exists.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the script forever.

    Returns:
        True if the server answered 200 and the file was written, False
        otherwise (in which case no file is created).

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If ``to_file`` cannot be written.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print("Failed to download", url, "- HTTP status", r.status_code)
            return False
        with open(to_file, 'wb') as f:
            for chunk in r.iter_content(1024):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    print("Save to ", to_file)
    return True

def replace_inplace(filepath, old_string, new_string):
    """Replace every occurrence of ``old_string`` in a file, in place.

    Args:
        filepath: Path of the UTF-8 text file to modify.
        old_string: Text to search for (plain text, not a regex).
        new_string: Text to put in its place.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline='' turns off newline translation in both directions, so the
    # file's existing line endings (e.g. CRLF) survive the rewrite untouched.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        filedata = f.read()

    newdata = filedata.replace(old_string, new_string)
    if newdata == filedata:
        # Nothing to replace: leave the file (and its mtime) alone.
        return

    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        f.write(newdata)

def download_image_wrapper(url, to_dir):
    """Download the image at ``url`` into ``to_dir`` and return its new URL.

    The local file keeps the last path component of ``url`` as its name.

    Args:
        url: Address of the image to fetch.
        to_dir: Existing directory in which the image is stored.

    Returns:
        ``https://kezunlin.me/images/posts/<filename>`` when the image file
        is present in ``to_dir`` after the download attempt; otherwise the
        original ``url`` unchanged, so that a caller substituting the
        returned value never points a post at an image that was not saved.
    """
    filename = url.split("/")[-1]
    to_file = os.path.join(to_dir, filename)
    download_image(url, to_file)

    # Check for the file itself: a failed download leaves nothing behind.
    if not os.path.isfile(to_file):
        return url

    new_url = "https://kezunlin.me/images/posts/{}".format(filename)
    return new_url

def process_all_posts(posts_dir="_posts", to_dir="images/posts"):
    """Download the cnblogs images of every post and rewrite their URLs.

    For each file directly inside ``posts_dir``: find the cnblogs image
    URLs, download each image into ``to_dir``, and replace the old URL in
    the post with the URL returned by ``download_image_wrapper``.

    Args:
        posts_dir: Directory containing the posts to process, e.g.
            "../source/_posts/". Defaults to "_posts".
        to_dir: Directory where downloaded images are stored; created if
            missing. Defaults to "images/posts".
    """
    makesure_dir(to_dir)

    for filepath in get_filepaths(posts_dir):
        if not os.path.isfile(filepath):
            # Sub-directories are listed too but cannot be opened as posts.
            continue
        print("="*20)
        print(filepath)

        handled = set()
        for url in find_cnblog_image_urls(filepath):
            if url in handled:
                # One replacement already rewrote every occurrence of this
                # URL in the file; downloading it again would be wasted work.
                continue
            handled.add(url)
            new_url = download_image_wrapper(url, to_dir)
            replace_inplace(filepath, url, new_url)

def main():
    """Script entry point: process all posts."""
    # regrex_demo()  # uncomment to run the regex demo
    process_all_posts()

# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

# Shell one-liner kept for reference: recursively searches source/_posts for
# the text "cnblogs.com/blog" (presumably to check which posts still link to
# cnblogs-hosted images -- confirm).
"""
grep -r "cnblogs.com/blog" source/_posts
"""

Reference

History

  • 2019/12/13: created.