需求:分析从富文本编辑器提交到服务端的HTML源码,从中检索形如 <img src="attachment/100" /> 或 <a href="attachment/101"> ... </a> 的标签,将其替换为形如 <tn-media hash="..."> 的内部标签,再存入数据库。
原方案:使用正则表达式搜索并替换上述标签,但在实际使用中比较容易出错,因此考虑用DOM树节点分析代替正则搜索。
环境:Python 2.7、web.py、BeautifulSoup(第三方DOM分析库)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Demo web.py app: list the <a>/<img> URLs found in a posted HTML fragment.

POST /parse takes a form field ``s`` holding HTML, parses it with
BeautifulSoup and reports every link/image URL plus the subset that
contains 'attachment/'.
"""
import json

import web
from BeautifulSoup import BeautifulSoup

urls = (
    '/', 'index',
    '/parse', 'parse',
)
web.config.debug = False
app = web.application(urls, globals())


def _collect_urls(soup, tag_name, attr_name):
    """Return ``(all_urls, attachment_urls)`` for the ``tag_name`` tags in ``soup``.

    Tags that do not carry ``attr_name`` at all (for example a named anchor
    ``<a name="top">``) are skipped: indexing them with ``tag[attr_name]``
    would raise KeyError and abort the whole request.
    """
    all_urls = []
    for tag in soup.findAll(tag_name):
        url = tag.get(attr_name)
        if url is not None:
            all_urls.append(url)
    attachment_urls = [url for url in all_urls if 'attachment/' in url]
    return all_urls, attachment_urls


class index:
    """Handler for '/'."""

    def GET(self):
        # NOTE(review): the page body is an empty string; presumably the
        # original form markup was lost -- confirm against the real source.
        s = ''''''
        return s


class parse:
    """Handler for '/parse': reports the URLs found in the submitted HTML."""

    def POST(self):
        # Default ``s`` to an empty string so a request without the field
        # yields an empty report instead of an AttributeError.
        wi = web.input(s='')
        s = wi.s
        soup = BeautifulSoup(s)

        links_all, links_needed = _collect_urls(soup, 'a', 'href')
        s1 = "ALL LINK:%sWE NEED LINK:%s" % (
            json.dumps(links_all), json.dumps(links_needed))

        imgs_all, imgs_needed = _collect_urls(soup, 'img', 'src')
        s2 = "ALL IMG:%sWE NEED IMG:%s" % (
            json.dumps(imgs_all), json.dumps(imgs_needed))

        # NOTE(review): the submitted HTML is echoed back unescaped; fine for
        # a local demo, but do not expose this handler to untrusted users.
        return ''' %s %s %s ''' % (s, s1, s2)


if __name__ == "__main__":
    app.run()
不过,因为担心BeautifulSoup不够稳定(字符编码可能出错、解析不规范的HTML标记时可能出错等),所以只采用BeautifulSoup功能的一个子集:只让它检索标签,而不修改DOM树;修改HTML的工作仍然由调用它的程序负责完成。
def convertToMedia(self, db, s):
    """Rewrite attachment ``<a>``/``<img>`` tags in ``s`` as ``<tn-media>`` tags.

    New version of the code: written and runs, still awaiting testing.
    Requires BeautifulSoup to be installed.

    BeautifulSoup is only used to *find* the tags; the HTML text itself is
    edited with ``str.replace`` on the serialized form of each tag. If that
    serialized form does not occur verbatim in ``s``, the tag is left as is.

    Args:
        db: web.py database object; queried for the attachment digest.
        s: HTML source coming from the rich-text editor.

    Returns:
        A ``(False, html)`` tuple. The leading False is kept only because
        the external callers of the original implementation expect it; it
        can be dropped once those callers are changed.
    """
    # NOTE(review): the replacement markup in the original snippet was empty
    # ('' % digest raises TypeError). This template is reconstructed from the
    # stated requirement (<tn-media hash="...">) -- confirm the exact shape.
    media_template = '<tn-media hash="%s"></tn-media>'

    def get_digest_of_attachment(att_id):
        """Return the digest stored for ``att_id``, or None if there is no row."""
        rows = db.query(
            "select digest from Attachment where attId=$attId",
            vars={'attId': att_id},
        ).list()
        if rows:
            return rows[0].digest
        return None

    def parse_attachment_id(url):
        """Return the integer id of a '/attachment/<id>' URL, else None."""
        # The expected string is exactly '/attachment/12', which splits into
        # ['', 'attachment', '12']; anything else is not an attachment link.
        if url is None or not url.startswith('/attachment/'):
            return None
        parts = url.split('/')
        if len(parts) != 3 or not parts[2]:
            return None
        try:
            return int(parts[2])
        except ValueError:
            # Non-numeric id such as '/attachment/abc': skip, do not crash.
            return None

    def convert_tags(tags, attr_name, html):
        """Replace every attachment tag in ``tags`` inside ``html``."""
        for tag in tags:
            # .get() returns None for tags lacking the attribute, where
            # tag[attr_name] would raise KeyError.
            att_id = parse_attachment_id(tag.get(attr_name))
            if att_id is None:
                continue
            digest = get_digest_of_attachment(att_id)
            if digest is None:
                continue
            html = html.replace(str(tag), media_template % (digest,))
        return html

    if not s:
        # Nothing to parse or rewrite.
        return False, s

    from BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(s)
    s = convert_tags(soup.findAll('a'), 'href', s)
    s = convert_tags(soup.findAll('img'), 'src', s)
    return False, s
BeautifulSoup在DOM树上进行修改的例子
# Example of editing the DOM tree in place: every <img> whose src contains
# 'attachment/' is replaced by an empty string, i.e. removed from the tree.
# `soup` is a BeautifulSoup document built beforehand.
a = soup.findAll('img')
for i in a:
    # .get() tolerates <img> tags without a src attribute, where i['src']
    # would raise KeyError.
    if 'attachment/' in (i.get('src') or ''):
        i.replaceWith('')