背景
在实际项目中经常遇到子串查找或者匹配的问题。即查找子串test_sub
在原始文本test_str
中的索引位置。进行直接给出各方案的评测对比。
各方案对比
各种常见的子串匹配方案如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/6/18 16:23
# @Author : JasonLiu
# @File : test_match_index.py
import pdb
from functools import reduce
from timeit import timeit
import re
import time
def find_index_startswith(test_str, test_sub):
# using list comprehension + startswith()
# All occurrences of substring in string
res = [i for i in range(len(test_str)) if test_str.startswith(test_sub, i)]
return res
def find_index_finditer(test_str, test_sub):
res = [i.start() for i in re.finditer(test_sub, test_str)]
return res
def find_index_replace(test_str, test_sub):
res = []
while (test_str.find(test_sub) != -1):
res.append(test_str.find(test_sub))
test_str = test_str.replace(test_sub, "*" * len(test_sub), 1)
return res
# 最快
def find_substring_indices(string, substring):
# Initialize an empty list to store the start indices of the substrings
indices = []
# Start searching for the substring from the beginning of the string
start_index = 0
# Continue searching until the substring is not found in the remaining part of the string
while True:
# Find the next occurrence of the substring starting from the current start_index
index = string.find(substring, start_index)
if index == -1:
# If the substring is not found, break out of the loop
break
else:
# If the substring is found, add its start index to the list of indices
indices.append(index)
# Update the start index to start searching for the next occurrence of the substring
start_index = index + 1
# Return the list of indices
return indices
res = find_substring_indices("我是卖核弹的小男孩", "核弹")
# pdb.set_trace()
def find_all_substrings(string, substring):
# Initialize an empty list to store the indices of all occurrences of the substring.
indices = []
# Set the starting index i to 0.
i = 0
# Use a while loop to keep searching for the substring in the string.
while i < len(string):
# Use the find() method to find the first occurrence of the substring in the string, starting from the current index i.
j = string.find(substring, i)
# If find() returns -1, it means that there are no more occurrences of the substring in the string, so break out of the loop.
if j == -1:
break
# If find() returns a non-negative value, append the index of the first character of the substring to the list,
# and update the starting index i to the next character after the end of the substring.
indices.append(j)
i = j + len(substring)
# Return the list of indices.
return indices
def find_index_finditer_reduce(test_str, test_sub):
# using re.finditer() to find all occurrences of substring in string
occurrences = re.finditer(test_sub, test_str)
# using reduce() to get start indices of all occurrences
res = reduce(lambda x, y: x + [y.start()], occurrences, [])
return res
def find(raw_string, short_text):
if raw_string.find(short_text) > -1:
pass
def re_find(raw_string, short_text):
if re.match(short_text, raw_string):
pass
# 最快
def best_find(raw_string, short_text):
if short_text in raw_string:
pass
# number: stmt执行的次数,默认是1000000,100万
# print(timeit("find(string, text)", "from __main__ import find; string='lookforme'; text='look'"))
# print(timeit("re_find(string, text)", "from __main__ import re_find; string='lookforme'; text='look'"))
# print(timeit("best_find(string, text)", "from __main__ import best_find; string='lookforme'; text='look'"))
print(timeit("find_index_startswith(string, text)",
"from __main__ import find_index_startswith; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_index_finditer(string, text)",
"from __main__ import find_index_finditer; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_index_replace(string, text)",
"from __main__ import find_index_replace; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_substring_indices(string, text)",
"from __main__ import find_substring_indices; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_all_substrings(string, text)",
"from __main__ import find_all_substrings; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_index_finditer_reduce(string, text)",
"from __main__ import find_index_finditer_reduce; string='我是卖核弹的小男孩'; text='小男孩'"))
start_time = time.time()
find_substring_indices("我是卖核弹的小男孩", "小男孩")
end_time = time.time()
print("find_substring_indices cost=", end_time - start_time)
start_time = time.time()
res = find_all_substrings("我是卖核弹的小男孩", "小男孩")
print("res=", res)
end_time = time.time()
print("find_all_substrings cost=", end_time - start_time)
运行结果如下:
1.2356753833591938
0.8407432101666927
0.5224904119968414
0.29449306428432465
0.2958533354103565
1.0457346253097057
find_substring_indices cost= 2.86102294921875e-06
res= [6]
find_all_substrings cost= 6.9141387939453125e-06
可以看出find_substring_indices
和find_all_substrings
这两种方式查找最快。