最近对明日方舟剧情很感兴趣,但游戏终有关服的一天,把剧情文本搞到本地才有安全感。于是到网上找到相关剧情,发现要么不能复制,要么会复制到奇怪的东西,反正都是麻烦,索性就用爬虫给爬下来了。
一个简单的爬虫程序,但过程中还挺麻烦的,涉及到正则表达式,文件的读写。新学到readlines的使用方法,感觉很巧妙。主要内容都写在注释里了,不再赘述。
代码 提取活动文本(建议B站Wiki)
"""Fetch one Arknights story page from PRTS wiki and print it as plain text."""
import re

# File the raw page source is saved to on every run.
RAW_PAGE_PATH = '剧情文本.txt'

# Dialogue line: [name="Speaker"]spoken text
FIND_TEXT = re.compile(r'\[name=".*?"].*')
FIND_NAME = re.compile(r'\[name="(.*?)"]')
# Everything after the first closing bracket of a dialogue line.
FIND_IN = re.compile(r'](.*)')
# Subtitle line: [Subtitle(text="...", ...)]
FIND_SUB = re.compile(r'\[Subtitle\(text="(.*?)"')
# Section title: the text following a tag that ends with MASK")]
FIND_TITLE = re.compile(r'MASK"\)] (.*)')

# Punctuation accepted by is_all_chinese in addition to CJK ideographs.
_ALLOWED_PUNCTUATION = frozenset('?,。“”—.')
# Deletes newline, carriage return and tab in a single pass.
_STRIP_WHITESPACE = str.maketrans('', '', '\n\r\t')


def Askurl(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # Imported here so the parsing helpers in this script can be used
    # without the third-party ``requests`` package being installed.
    import requests

    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 92.0.4515.107Safari / 537.36Edg / 92.0.902.55"
    }
    # Without a timeout a stalled connection would block forever.
    req = requests.get(url=url, headers=head, timeout=30)
    return req.content.decode("utf-8")


def is_all_chinese(strs):
    """Return True if every character is a CJK ideograph or allowed punctuation.

    An empty string yields True because there is no offending character.
    """
    return all(
        '\u4e00' <= ch <= '\u9fa5' or ch in _ALLOWED_PUNCTUATION
        for ch in strs
    )


def format_line(line):
    """Turn one raw script line into the list of text lines to print.

    The first matching rule wins:
    dialogue -> ``【speaker】: text``; subtitle -> its text; section title ->
    the title surrounded by blank lines; a line made only of Chinese text ->
    the line without newline/tab characters. Anything else yields ``[]``.
    """
    dialogue = FIND_TEXT.search(line)
    if dialogue:
        matched = dialogue.group(0)
        speaker = FIND_NAME.findall(matched)[0]
        speech = FIND_IN.findall(matched)[0]
        return ["【" + speaker + "】" + ": " + speech]

    subtitle = FIND_SUB.search(line)
    if subtitle:
        return [subtitle.group(1)]

    title = FIND_TITLE.search(line)
    if title:
        return ["", title.group(1), ""]

    text = line.translate(_STRIP_WHITESPACE)
    if text and is_all_chinese(text):
        return [text]
    return []


def Gtedata(baseurl, source_path='吾导先路.txt'):
    """Download *baseurl*, save it, then print the story text in *source_path*.

    The downloaded page is always written to ``剧情文本.txt``.

    NOTE(review): the text that gets parsed comes from *source_path*, which
    defaults to ``吾导先路.txt`` and not to the file just written. Pass
    ``source_path='剧情文本.txt'`` to parse the freshly downloaded page.

    Raises:
        FileNotFoundError: if *source_path* does not exist.
    """
    html = Askurl(baseurl)
    # Context managers guarantee the data is flushed and the handles closed.
    with open(RAW_PAGE_PATH, 'w', encoding='utf-8') as raw_file:
        raw_file.write(html)

    with open(source_path, 'r', encoding='utf-8') as source_file:
        for line in source_file:
            for output in format_line(line):
                print(output)


def main():
    """Extract the story text for the hard-coded wiki page."""
    baseurl = "https://prts.wiki/index.php?title=10-1_%E8%A2%AB%E8%BF%BD%E9%80%90%E8%80%85/NBT&action=edit"
    Gtedata(baseurl)


if __name__ == "__main__":
    main()
提取主线文本
与上面不同,主线章节在wiki中多以小节的方式呈现,难以整合,于是修改了一下,使程序可以从本地的"链接.txt"(其中保存了包含各小节<a>标签的页面源码)中批量获取链接。(如果不嫌麻烦,依然可以用上面的方法)
"""Batch-fetch Arknights main-story pages from PRTS wiki and print them as text."""
import re

# File each downloaded page is saved to before it is parsed.
RAW_PAGE_PATH = '剧情文本.txt'
# Local file holding page source with the <a> tags of the pages to fetch.
LINKS_PATH = '链接.txt'

# Dialogue line: [name="Speaker"]spoken text
FIND_TEXT = re.compile(r'\[name=".*?"].*')
FIND_NAME = re.compile(r'\[name="(.*?)"]')
# Everything after the first closing bracket of a dialogue line.
FIND_IN = re.compile(r'](.*)')
# Subtitle line: [Subtitle(text="...", ...)]
FIND_SUB = re.compile(r'\[Subtitle\(text="(.*?)"')
# Section title: the text following a tag that ends with MASK")]
FIND_TITLE = re.compile(r'MASK"\)] (.*)')

# Anchor tags and the two pieces taken from each of them.
FIND_ANCHOR = re.compile(r'<a (.*?)/a>')
FIND_HREF = re.compile(r'href="/w/(.*?)" title=')
FIND_LABEL = re.compile(r'">(.*?)<')

# Punctuation accepted by is_all_chinese in addition to CJK ideographs.
_ALLOWED_PUNCTUATION = frozenset('?,。“”—.')
# Deletes newline, carriage return and tab in a single pass.
_STRIP_WHITESPACE = str.maketrans('', '', '\n\r\t')


def Askurl(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # Imported here so the parsing helpers in this script can be used
    # without the third-party ``requests`` package being installed.
    import requests

    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 92.0.4515.107Safari / 537.36Edg / 92.0.902.55"
    }
    # Without a timeout a stalled connection would block the whole batch.
    req = requests.get(url=url, headers=head, timeout=30)
    return req.content.decode("utf-8")


def is_all_chinese(strs):
    """Return True if every character is a CJK ideograph or allowed punctuation.

    An empty string yields True because there is no offending character.
    """
    return all(
        '\u4e00' <= ch <= '\u9fa5' or ch in _ALLOWED_PUNCTUATION
        for ch in strs
    )


def format_line(line):
    """Turn one raw script line into the list of text lines to print.

    The first matching rule wins:
    dialogue -> ``【speaker】: text``; subtitle -> its text; section title ->
    the title surrounded by blank lines; a line made only of Chinese text ->
    the line without newline/tab characters. Anything else yields ``[]``.
    """
    dialogue = FIND_TEXT.search(line)
    if dialogue:
        matched = dialogue.group(0)
        speaker = FIND_NAME.findall(matched)[0]
        speech = FIND_IN.findall(matched)[0]
        return ["【" + speaker + "】" + ": " + speech]

    subtitle = FIND_SUB.search(line)
    if subtitle:
        return [subtitle.group(1)]

    title = FIND_TITLE.search(line)
    if title:
        return ["", title.group(1), ""]

    text = line.translate(_STRIP_WHITESPACE)
    if text and is_all_chinese(text):
        return [text]
    return []


def extract_links(page_source):
    """Return ``(label, edit_url)`` pairs for the wiki anchors in *page_source*.

    Anchors without an ``href="/w/..."`` target or without a visible label
    are skipped instead of aborting the run with an IndexError.
    """
    pairs = []
    for anchor in FIND_ANCHOR.findall(page_source):
        href = FIND_HREF.search(anchor)
        label = FIND_LABEL.search(anchor)
        if href is None or label is None:
            continue
        url = "https://prts.wiki/index.php?title=" + href.group(1) + "&action=edit"
        pairs.append((label.group(1), url))
    return pairs


def Gtedata(baseurl):
    """Download *baseurl*, save it to ``剧情文本.txt`` and print its story text."""
    html = Askurl(baseurl)
    # The write handle must be closed (and thereby flushed) before the file
    # is read back; otherwise the read can see a truncated or empty file.
    with open(RAW_PAGE_PATH, 'w', encoding='utf-8') as raw_file:
        raw_file.write(html)

    with open(RAW_PAGE_PATH, 'r', encoding='utf-8') as raw_file:
        for line in raw_file:
            for output in format_line(line):
                print(output)


def main():
    """Print the story text of every wiki page linked from ``链接.txt``."""
    with open(LINKS_PATH, 'r', encoding='utf-8') as links_file:
        page_source = links_file.read()

    for title, link in extract_links(page_source):
        print("\n")
        print(title)
        print("\n")
        Gtedata(link)


if __name__ == "__main__":
    main()