336x280(권장), 300x250(권장), 250x250, 200x200 크기의 광고 코드만 넣을 수 있습니다.

https://www.crummy.com/software/BeautifulSoup/



설치: pip install beautifulsoup4


from bs4 import BeautifulSoup
import re
# Sample HTML document used by the find()/find_all() examples that follow.
# A raw string is used because the Windows path in the <img> tag contains
# "\P" and "\K", which are invalid escape sequences in a normal string
# literal (they trigger a warning on recent Python versions). The resulting
# text is exactly the same as before.
html = r"""
<html>
<head>
<title> test web </title>
</head>
<body>
<p align="center"> text contents 1 </p>
<p align="right"> text contents 2 </p>
<p align="left"> text contents 3 </p>
<p align="center"> text contents 4 </p>
<img src="c:\Python34\Koala.jpg" width="500" height="300">
</body>
</html> """

# Name the parser explicitly: without it Beautiful Soup guesses from whatever
# parsers happen to be installed and emits a warning about doing so.
bs = BeautifulSoup(html, 'html.parser')

# Print the parsed document as indented markup.
print(bs.prettify())

# find(): fetches only a single tag (the first match).
print(bs.find('title'))  # <title> test web </title>

# First <p> for each alignment value, queried in the order center, right, left.
# Expected output, one line per lookup:
#   <p align="center"> text contents 1 </p>
#   <p align="right"> text contents 2 </p>
#   <p align="left"> text contents 3 </p>
for alignment in ('center', 'right', 'left'):
    print(bs.find('p', align=alignment))

# find_all(): every matching tag, returned as a list.
# [<p align="center"> text contents 1 </p>, <p align="right"> text contents 2 </p>,
#  <p align="left"> text contents 3 </p>, <p align="center"> text contents 4 </p>]
print(bs.find_all('p'))

# Searching from a tag only looks inside that tag:
#   'title' -> <title> test web </title>
#   'p'     -> None (the <p> tags are not inside <head>)
head_tag = bs.find('head')
for tag_name in ('title', 'p'):
    print(head_tag.find(tag_name))

body_tag = bs.find('body')

# Passing a list of names matches any tag whose name appears in the list.
list1 = body_tag.find_all(['p', 'img'])

# The loop body must be indented; at column 0 the script does not compile.
for tag in list1:
    print(tag)
# Expected output:
#   <p align="center"> text contents 1 </p>
#   <p align="right"> text contents 2 </p>
#   <p align="left"> text contents 3 </p>
#   <p align="center"> text contents 4 </p>
#   <img height="300" src="c:\Python34\Koala.jpg" width="500"/>

# A compiled regular expression is matched against tag names;
# '^p' selects the tags whose name starts with "p".
tags = bs.find_all(re.compile('^p'))
print(tags)

# find_all()
# Besides a tag name, it accepts attribute filters, a text filter, limit, etc.

# [<p align="center"> text contents 1 </p>, <p align="center"> text contents 4 </p>]
print(bs.find_all(align='center'))

# [<img height="300" src="c:\Python34\Koala.jpg" width="500"/>]
print(bs.find_all(width='500'))

# Text filters use the `string` keyword; `text` is its deprecated older name.
# [' text contents 1 ']
print(bs.find_all(string=' text contents 1 '))

# [' text contents 1 ', ' text contents 2 ', ' text contents 3 ', ' text contents 4 ']
print(bs.find_all(string=re.compile('text+')))

# limit: caps the number of tags returned.
# [<p align="center"> text contents 1 </p>, <p align="right"> text contents 2 </p>]
print(bs.find_all('p', limit=2))

# Visual separator between tutorial sections.
print()
print('*' * 50)
print('*' * 50)

# Getting text out of tags.
body_tag = bs.find('body')
p_tag = body_tag.find('p')
# `.string` only gives one string at a time.
print(p_tag.string)  # text contents 1

# Getting every string: `.strings` iterates over all text inside the tag.
strings = body_tag.strings
# The loop body must be indented; at column 0 the script does not compile.
for string in strings:
    print(string)
# Expected output (the blank lines are the line breaks between the tags):
#   text contents 1
#
#
#   text contents 2
#
#
#   text contents 3
#
#
#   text contents 4

# Join all the strings inside a tag into one string.
print(body_tag.get_text())
# Expected output:
#   text contents 1
#   text contents 2
#   text contents 3
#   text contents 4

# With strip=True the line breaks in between are all removed.
# text contents 1text contents 2text contents 3text contents 4
print(body_tag.get_text(strip=True))

# The first argument is the separator placed between the pieces.
# text contents 1-text contents 2-text contents 3-text contents 4
print(body_tag.get_text('-', strip=True))

# Tag attributes
# Beautiful Soup can read the HTML class attribute, and tag attributes can
# also be added, deleted and modified.
#
# Raw string: "\P" and "\K" in the image path are invalid escape sequences in
# a normal string literal; the resulting text is unchanged.
html = r"""
<html>
<head>
<title> test web </title>
</head>
<body>
<p class="ptag black" align="center"> text contents 1 </p>
<p class="ptag yellow" align="center"> text contents 2 </p>
<p class="ptag red" align="center"> text contents 3 </p>
<img src="c:\Python34\Koala.jpg" width="500" height="300">
</body>
</html> """

# Name the parser explicitly instead of letting Beautiful Soup guess (and warn).
bs = BeautifulSoup(html, 'html.parser')
p_tag = bs.find('p')
# `class` is returned as a list of class names.
print(p_tag['class'])  # ['ptag', 'black']

# Modify an attribute: replace the second class name.
p_tag['class'][1] = 'white'
print(p_tag['class'])  # ['ptag', 'white']

# Add an attribute.
p_tag['id'] = 'P-TAG'
print(p_tag['id'])  # P-TAG

# Remove an attribute.
print(p_tag['align'])  # center
del p_tag['align']
# Reading p_tag['align'] after the deletion would raise an error.

# The attributes are exposed as a dictionary.
print(p_tag.attrs)  # {'class': ['ptag', 'white'], 'id': 'P-TAG'}


###############
# Tag relationships
###############
# Raw string: "\P" and "\K" in the image path are invalid escape sequences in
# a normal string literal; the resulting text is unchanged.
html = r"""
<html>
<head>
<title> test web </title>
</head>
<body>
<p class="ptag black" align="center"> text contents 1 </p>
<p class="ptag yellow" align="center"> text contents 2 </p>
<p class="ptag red" align="center"> text contents 3 </p>
<img src="c:\Python34\Koala.jpg" width="500" height="300">
</body>
</html> """

# Name the parser explicitly instead of letting Beautiful Soup guess (and warn).
bs = BeautifulSoup(html, 'html.parser')
body_tag = bs.find('body')
print(body_tag)
# Expected output:
#   <body>
#   <p align="center" class="ptag black"> text contents 1 </p>
#   <p align="center" class="ptag yellow"> text contents 2 </p>
#   <p align="center" class="ptag red"> text contents 3 </p>
#   <img height="300" src="c:\Python34\Koala.jpg" width="500"/>
#   </body>

# `.children` iterates over the direct children of the tag.
# The loop body must be indented; at column 0 the script does not compile.
for child in body_tag.children:
    print(child)
# Expected output (the line breaks between the tags show up as blank lines):
#   <p align="center" class="ptag black"> text contents 1 </p>
#
#
#   <p align="center" class="ptag yellow"> text contents 2 </p>
#
#
#   <p align="center" class="ptag red"> text contents 3 </p>
#
#
#   <img height="300" src="c:\Python34\Koala.jpg" width="500"/>
#

img_tag = bs.find('img')
print(img_tag.parent)
# The parent of <img> is <body>, so the whole body is printed:
#   <body>
#   <p align="center" class="ptag black"> text contents 1 </p>
#   <p align="center" class="ptag yellow"> text contents 2 </p>
#   <p align="center" class="ptag red"> text contents 3 </p>
#   <img height="300" src="c:\Python34\Koala.jpg" width="500"/>
#   </body>

# find_parent(): looks up a parent tag by name.
print(img_tag.find_parent('body'))
print(img_tag.find_parent('html'))

# Same sample document with an extra <div> wrapping a <p class="text">.
# Raw string: "\P" and "\K" in the image path are invalid escape sequences in
# a normal string literal; the resulting text is unchanged.
html = r"""
<html>
<head>
<title> test web </title>
</head>
<body>
<p class="ptag black" align="center"> text contents 1 </p>
<p class="ptag yellow" align="center"> text contents 2 </p>
<p class="ptag red" align="center"> text contents 3 </p>
<img src="c:\Python34\Koala.jpg" width="500" height="300">

<div class="container">
<p class="text"> </p>
</div>
</body>
</html> """

# Name the parser explicitly instead of letting Beautiful Soup guess (and warn).
bs = BeautifulSoup(html, 'html.parser')

# find_parents(): find every parent above the <p class="text"> tag.
print('************************* div example')
# `class_` (trailing underscore) is used because `class` is a Python keyword.
p_tag = bs.find('p', class_='text')
parents = p_tag.find_parents()
# The loop body must be indented; at column 0 the script does not compile.
for parent in parents:
    print(parent.name)
# Expected output:
#   div
#   body
#   html
#   [document]



블로그 이미지

뚱땡이 우주인

,