Python에서 파일에 쓸 때 유니코드 문자를 인코딩할 수 없습니다.

2024-6-3 • tag-icon

여기에서는 Python을 사용하여 로컬에 저장된 HTML 파일에서 성경 구절을 추출하고 있습니다. 실제로 이 Python은 제가 작성한 것이 아니며 어딘가에서 가져온 것입니다. 성경 구절을 추출하기 위해 Python 코드를 실행할 때 다음과 같은 결과가 나타납니다. u'Joseph asopo atum aphan Jakob thek-longsi, alangli arjulo, \u201cLahei komat asomarlo?\u201d'이러한 유형의 줄이 많은 것은 파일에 쓸 때 유니코드 문자를 올바르게 인코딩하지 않습니다. 터미널에서 실행하면 유니코드 문자가 올바르게 표시되고 인코딩됩니다. 누구든지 문제를 해결할 수 있다면 여기에 Python 코드가 있고 여기에 로컬로 저장된 HTML이 있습니다.zip 형식의 html 파일 다운로드 이것은파이썬 파일. 이 Python 코드는 폴더 아래의 HTML 파일을 읽고 example폴더 내에 출력 파일을 생성합니다.example/exampleoutputs

import os, sys
import numpy as np
import array as arr
import re
from lxml import etree
from lxml import html
import urllib2
from bs4 import BeautifulSoup
import csv
import codecs
import shutil
class BIBLE_CLASS:

    def __init__(self,path):
        self.bookslist=[]

    def readBookList(self,path):    # Read the data from file
        file=open(path,'r')
        for x in file:
        #   print(x)
            self.bookslist.append(x)
        #print(Bookslist)
        #for i in range(1,len(self.bookslist)):
        #   print(self.bookslist[i])

    def searchBook(self,book):  # Search the book ID in the books file
        #bookcode=0
        for i in range(1,len(self.bookslist)):
            st=str(self.bookslist[i])
            #st=st.upper()
            #skey=book.upper()
            #print(st,'',skey)
            if(st.find(book)!=-1):
                temp=self.bookslist[i]
                index1=temp.index(',')
                #print(temp[0:index1])  
                bookcode=temp[0:index1]
                #print(bookcode)
                return bookcode

    def writeStoryFile(self,path):
        # Writes data into story file
        filext1=['.html']
        print("writeStoryFile")
        dirs=os.listdir(path)
            pattern=re.compile("^v")
        # CREATE A FOLDER WITH PATH+OUTPUTS AS NAME
        outputfile=path+"outputs"
        outpath=os.path.join(path,outputfile)
        if os.path.exists(outpath):
            shutil.rmtree(outpath)
        else:
            os.mkdir(outpath)
        for d in dirs:  #For each directory extract stories from files
            print("Directory name:",d)
            btitle=d[d.rindex('_')+1:]
            print(btitle)
            print('Book code:',o1.searchBook(btitle))   # Book code is extracted
            fullpath=os.path.join(path,d)
            if os.path.isdir(fullpath):
                print("Converting folder",fullpath)
                files=os.listdir(fullpath)
            rno=0
            # GET THE BOOK CODE FROM THE FILE
            btitle=d[d.rindex('_')+1:]   
            #print(btitle)  
            #print('Book code:',o1.searchBook(btitle))
            bcode=o1.searchBook(btitle)
            print(bcode)

            # CREATE STORY LINE FILE FOR EACH FOLDER
            csvfile1=outpath+"/"+d+"story.csv"  # story line filename
            f1 = codecs.open(csvfile1, encoding='utf-8',mode='w')   # Creating story lines file
            csvfile2=outpath+"/"+d+"storyverses.csv"    # story line with verses files
            csvfile3=outpath+"/"+d+"veses.csv"   # only verses      
            f2 = codecs.open(csvfile2, encoding='utf-8',mode='w')
            f3=codecs.open(csvfile3,  encoding='utf-8',mode='w')    
            for f in files:
                fname,fext=os.path.splitext(f)
                rows=[]

                if fext in filext1:
                    print("=================Processing the file",f,"========================")
                    # Process the file
                    print "Stories in the chapter:" 
        #           fullpath=os.path.join(path,file)
                    wpath=os.path.join(fullpath,f)
                    text=open(wpath,"r")
        #           csvfile1=wpath.replace('.html','story.csv') # story line filename
                        html_doc=text.read()    
                        soup = BeautifulSoup(html_doc, 'html.parser',from_encoding="utf-8")
                        h3s = soup.find_all('h3')
                        storyverses=[]
                        storytitles=[]  

                        for h3 in h3s:
                        next_element = h3.find_next()
                        print next_element.text

                        if next_element.find('sup') is None:
                        print("Element is null")
                        required_element=next_element.find_next()
                        if required_element is None:
                            required_element=next_element.next_element.find_next()
                        if required_element.find('sup') is None:
                            break
                        required_element= required_element.find_next('sup')

                        superscript_number = str(required_element.find_next('sup').text)                    
                     #   print bcode,fname,superscript_number,h3.text                           
                        print "========",superscript_number
                        if len(superscript_number)>2:
                            storyverses.append(int(superscript_number[0:1]))
                            sn=superscript_number[0:1]  
                            print sn
                            st=int(bcode),int(fname),int(superscript_number[0:1]), str(h3.text)
                        else:
                            storyverses.append(int(superscript_number))
                            st=int(bcode),int(fname),int(superscript_number), (h3.text).encode('utf-8')

                        storytitles.append(h3.text) 
                        print st    
                        rows.append(st)
                        rno=rno+1   
                        else:

                        superscript_number = str(next_element.find('sup').text)                 
                        if len(superscript_number)>2:
                            storyverses.append(int(superscript_number[0:1]))
                            sn=superscript_number[0:1]  
                            print sn
                            st=int(bcode),int(fname),int(superscript_number[0:1]), str(h3.text)
                        else:
                            storyverses.append(int(superscript_number))
                            st=int(bcode),int(fname),int(superscript_number), str(h3.text)

                        storytitles.append(h3.text) 
                        print st    
                        rows.append(st)
                        rno=rno+1
                        #print storytitles[0]   
                    # Write story lines into file 
                #   f1 = codecs.open(csvfile1, encoding='utf-8',mode='w')   # Creating story lines file
                    for row in rows:               
                        f1.write(str(row))
                        f1.write("\n")

                #   csvfile2=wpath.replace('.html','verse.csv') # story line files
                #       csvfile3=wpath.replace('.html','veses.csv')     
#                       f2=open(csvfile2,"w")
                    verses=[]   
                    verseno=1   
                 #      f2 = codecs.open(csvfile2, encoding='utf-8',mode='w')
                 #      f3=codecs.open(csvfile3,  encoding='utf-8',mode='w')    
                    #f2=codecs.open(csvfile2,mode='w')
                    #f3=codecs.open(csvfile3,mode='w')
                        print "======================================================================================"  
                        k=0 
                        for y in soup.findAll('span',class_=[pattern,"heading"]):
                        sups=y('sup')
                        for z in sups:
                            z.decompose()
                        if verseno in storyverses:
                            st1=str(rows[storyverses.index(verseno)])
                            k=k+1
                            f2.write(st1)
                            f2.write("\n")
                            verses.append(st1)
                            print bcode,fname, verseno,'"'+y.text+'"'
                            st=int(bcode),int(fname), verseno,y.text
                            f2.write(str(st))
                            f2.write("\n")
                            f3.write(str(st))
                            f3.write("\n")
                            verses.append(st)
        #       print st
                        else:
                            print bcode,fname, verseno,'"'+y.text+'"'   
                            st=int(bcode),int(fname),verseno,y.text
                            f2.write(str(st))
                            f2.write("\n")
                            f3.write(str(st))
                            f3.write("\n")
                            verses.append(st)
                        verseno+=1
                #   f1.close()
                #   f2.close()
                #   f3.close()
    def writeVerses(self):  
        print("writeVerses")
        # Writes verses into verse file

    def writebothVerse(self):
        print("writeStoryFile")
        # Writes verse with story in between

# Main program
bookspath="BookCode/books.csv"
path='example'   # Change this to the actual path of the input data

o1=BIBLE_CLASS(path)
# Step 1: Read the book list from the file
o1.readBookList(bookspath)

# Step 2: Create story files
o1.writeStoryFile(path)
filext1=['.html']
dirs=os.listdir(path)

다음은 출력 파일의 일부 스크린샷입니다.

관련 정보