亚洲在线久爱草,狠狠天天香蕉网,天天搞日日干久草,伊人亚洲日本欧美

為了賬號安全,請及時綁定郵箱和手機立即綁定

基于Python正則表達式的正文日期識別算法

根据我的上一篇博客:http://www.xianlaiwan.cn/search/article?words=迷之语法
我又写了一个简单的应用,用来识别并提取文章或通知中的重要日期。目前调试结果尚可,但仍存在一些不足:例如当一段文字中同时识别出多个日期时,程序只保留最长的那个匹配,尚未加入判断哪个日期更重要的条件。读者可以根据需要自行完善。

完整代码:

import re 
import pandas as pd 


# Compiled regular expressions for Chinese date/time phrases. The building
# blocks shared by most patterns are:
#   (\d+年+) (\d+月+) (\d+日+)      year / month / day parts
#   character classes with 本/下/周/星期  week and weekday markers
#   [上中下午晚早上]                 time-of-day characters
#   N时, N点, N点半, N点M分, H:MM    clock times
#   (--|-次日|-|至次日|至)           separator between a start and an end time
# The patterns differ mainly in which of these parts are required (`+`) and
# which are optional (`*`).
#
# NOTE(review): Python's `re` has no nested sets. `[[...]*[...]*]` therefore
# parses as two character classes followed by a literal `]`, and compiling
# these patterns emits "FutureWarning: Possible nested set". Where the trailing
# quantifier is `+` (pattern16, pattern2, pattern3, pattern5-8) a literal `]`
# is required in the text, which is presumably not what was intended - confirm
# before changing.
# NOTE(review): `(:|:)` lists the same alternative twice and `一-十二` is the
# range 一-十 plus the single character 二; verify against the author's
# original (possibly full-width) source.

# Month and day required; a start time is required, an end time is optional.
pattern1 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# Same time part as pattern1, but every date part is optional and the
# week-marker section ends in `]+`.
pattern16 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+(\d+分)*)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# Patterns 2-8: the date parts are optional.
# pattern2: week-marker section first, then the optional date and a start time.
pattern2 = re.compile(r'[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+(\d+年+)*(\d+月+)*(\d+日+)*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# pattern3: optional date, week-marker section, then two colon-separated clock values.
pattern3 = re.compile(r'(\d+年)*(\d+月)*(\d+日)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]+([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# pattern4: digits-digits-digits date (e.g. 2019-01-02), any characters, then two clock values.
pattern4 = re.compile(r'(\d+\-\d+\-\d+)+(.)*[上中下午晚早上]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# pattern5: a deadline word (下班前/之前/前) on both sides of the week-marker section.
pattern5 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+(下班前|之前|前)+')
# pattern6: optional deadline word, week-marker section, optional clock time.
pattern6 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*(下班前|之前|前)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二点]+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# pattern7: week-marker section followed by a required "N点M分" time.
pattern7 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[上中下午晚早上]*(\d+(点半|点)+)+(\d+分)+')
# pattern8: week-marker section followed by an optional, optionally parenthesised H:MM range.
pattern8 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]+[\(]*(([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*(-|至)*(([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*[\)]*')

# Patterns 9-15 and 18-20: month and day are required.
# pattern9: optional week markers, date, a required time-of-day character, then a start time.
pattern9 = re.compile(r'[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*(\d+年+)*(\d+月+)+(\d+日+)+[上中下午晚早上]+(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# pattern10: date, optional week markers, then two colon-separated clock values.
pattern10 = re.compile(r'(\d+年)*(\d+月)+(\d+日)+[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# pattern11: date, at least one arbitrary character, then two clock values (the loosest pattern).
pattern11 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(.)+[上中下午晚早上]*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*(-)*([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9])*')
# pattern12 / pattern18 / pattern19: date with deadline words; they differ only in
# whether the leading and trailing deadline word is required (+) or optional (*).
pattern12 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)+')
pattern18 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)*')
pattern19 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)*[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*(下班前|之前|前)+')
# pattern13: date, required deadline word, optional week markers and clock time.
pattern13 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+(下班前|之前|前)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[上中下午晚早上]*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二点]+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')
# pattern20: one or more of 明/后/天, a parenthesised month-day, then 全天.
pattern20 = re.compile(r'[明后天]+(\()+(\d+月+)+(\d+日+)+(\))+(全天)+')
# pattern14: date, optional week markers, then a required "N点M分" time.
pattern14 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[上中下午晚早上]*(\d+(点半|点)+)+(\d+分)+')
# pattern15: date, optional week markers, then a required deadline word.
pattern15 = re.compile(r'(\d+年+)*(\d+月+)+(\d+日+)+[[\((本|下)\(周一-日\)]*[\(星期一-天\)]*]*[\(]*(下班前|之前|前)+')

# pattern17: every date part optional; a time-of-day character and a start time are required.
pattern17 = re.compile(r'(\d+年+)*(\d+月+)*(\d+日+)*[[\(*本\(*周一-日\)*]*[\(*星期一-天\)*]*]*[上中下午晚早上]+(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))(--|-次日|-|至次日|至)*(([0123456789101112]+时+)|([0123456789101112]+(点半|点)+)|([一-十二]+(点半|点)+)|([01]?[0-9]|2[0-4])(:|:)+([0-5][0-9]))*')

# Directory holding the input workbook. Kept as a module-level name; the
# read below uses the full file path directly.
path = "E:/NLP/DataSet/Data"

# Patterns in the order they are tried for each text segment. Only the first
# pattern that matches is used for a segment; later ones are not consulted,
# even when that first match is subsequently rejected for lacking a digit.
_PATTERN_PRIORITY = (
    pattern4, pattern5, pattern7, pattern1, pattern16,
    pattern2, pattern3, pattern6, pattern8, pattern9,
    pattern10, pattern12, pattern18, pattern19, pattern13,
    pattern20, pattern14, pattern15, pattern17, pattern11,
)

_HAS_DIGIT = re.compile(r'\d')

# Everything that is NOT part of a date/time phrase. The hyphen is escaped:
# written bare between other characters it forms character ranges, and the
# previous spelling contained the reversed range "—- " which made `re` raise
# "bad character range" as soon as a candidate had to be cleaned.
_NON_TIME_CHARS = re.compile(
    r'[^年月日星期本周上中下午晚早一二两三四五六十时点半分次至下班之前明后全天'
    r'm0123456789:;()\-— ,]'
)


def _first_match(segment):
    """Return the text matched by the highest-priority pattern, or None."""
    for pattern in _PATTERN_PRIORITY:
        match = pattern.search(segment)
        if match:
            return match.group(0)
    return None


def _normalise(text):
    """Rewrite deadline words as a clock time and trim surplus prefix chars."""
    # "下班前" (before end of work) is replaced by the closing time 17:00.
    text = text.replace('下班前', '17:00')
    # When "日前" occurs, every "前" in the string is replaced, as before.
    if '日前' in text:
        text = text.replace('前', '17:00')
    # Drop the first occurrence of a character that appears more than twice.
    # "间" is already removed by the character filter, so that check is kept
    # only for parity with the earlier behaviour.
    for prefix_char in ('时', '间', ':'):
        if text.count(prefix_char) > 2:
            text = text.replace(prefix_char, '', 1)
    return text


def _extract_time(cell_text):
    """Extract the single best date/time phrase from one cell of text.

    The text is split on "于"; each piece contributes at most one candidate
    (the first pattern in priority order that matches, provided the match
    contains a digit). Candidates are stripped of characters unrelated to
    dates and times, and the longest one is kept.

    Returns:
        A list with one cleaned phrase, or an empty list when nothing matched.
    """
    candidates = []
    for segment in cell_text.split("于"):
        matched = _first_match(segment)
        if matched is None:
            continue
        print(matched)
        if _HAS_DIGIT.search(matched):
            # One substitution pass; no per-character patterns are built, so
            # regex metacharacters in the matched text cannot break the cleanup.
            candidates.append(_NON_TIME_CHARS.sub('', matched))
    if not candidates:
        return []
    # max() returns the first of several equally long candidates.
    return [_normalise(max(candidates, key=len))]


# Read the first column of the first worksheet as the texts to analyse.
df1 = pd.read_excel(r'E:/NLP/DataSet/Data/data.xlsx', sheet_name=0)
df = df1.iloc[:, 0]
re_time = [_extract_time(str(cell)) for cell in df]

# Append the extracted phrases as extra column(s) and write the result out.
re_time = pd.DataFrame(re_time, index=df.index)
out_time = pd.concat([df1, re_time], axis=1)
out_time.to_excel('out_time.xlsx')
print('Done')

为了方便,本程序的输入和输出均为 Excel 文件:从 data.xlsx 第一个工作表的第一列读取文本,识别结果追加为新列后写入 out_time.xlsx。

运行结果:
图片描述

點擊查看更多內容
TA 點贊

若覺得本文不錯,就分享一下吧!

評論

作者其他優質文章

正在加載中
  • 推薦
  • 評論
  • 收藏
  • 共同學習,寫下你的評論
感謝您的支持,我會繼續努力的~
掃碼打賞,你說多少就多少
贊賞金額會直接到老師賬戶
支付方式
打開微信掃一掃,即可進行掃碼打賞哦
今天注冊有機會得

100積分直接送

付費專欄免費學

大額優惠券免費領

立即參與 放棄機會
微信客服

購課補貼
聯系客服咨詢優惠詳情

幫助反饋 APP下載

慕課網APP
您的移動學習伙伴

公眾號

掃描二維碼
關注慕課網微信公眾號

舉報

0/150
提交
取消