Python
[Python] - 웹 크롤링 Parsing/Download (beautifulsoup4 예제)
1984
2022. 9. 1. 16:33
import requests
import os
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import urlopen
from urllib.parse import quote_plus
from urllib.error import HTTPError
# Notes on the imports above:
# 1) Packages for opening URLs
# 2) Package for using beautifulsoup4
# 3) Package for parsing a URL into its components
# URL of the Confluence download-archives page. Not referenced by the
# functions visible in this file.
url = "https://www.atlassian.com/software/confluence/download-archives"
# Host and path prefix of the binary download links. main() currently
# hard-codes the same prefix instead of combining these two values.
site = "https://www.atlassian.com"
rec = "/software/confluence/downloads/binary/"
# Local HTML file opened at import time; main() passes this handle to
# BeautifulSoup. Presumably a saved copy of the archive page -- confirm.
# NOTE(review): the handle is never closed anywhere in the visible code.
file = open("C:/test/html.txt", "r", encoding='UTF-8')
# Not used by the functions visible in this file.
overlap = []
# De-duplicated "data-version" values; filled and printed by main().
s = set()
def get_download(url, fname, directory):
    """Download *url* and save it as *fname* inside *directory*.

    The target directory is created when it does not exist yet, and the
    file is written through an explicit path, so the process working
    directory is left untouched.

    Args:
        url: Address of the resource to fetch.
        fname: File name to store the download under.
        directory: Folder that receives the file.

    Returns:
        None. Failures are reported on stdout instead of being raised, so
        a caller looping over many downloads keeps going.
    """
    try:
        # A missing folder used to make os.chdir() raise FileNotFoundError.
        os.makedirs(directory, exist_ok=True)
        request.urlretrieve(url, os.path.join(directory, fname))
    except OSError as exc:
        # HTTPError and URLError both derive from OSError, so HTTP status
        # errors, connection failures and local file-system errors all land
        # here rather than aborting the whole batch.
        print('error', url, exc)
        return
    else:
        print('다운로드 완료\n')
def main():
    """Collect Confluence versions from the saved HTML and download each one.

    Parses the module-level ``file`` handle with BeautifulSoup, gathers the
    ``data-version`` attribute of every ``<a>`` element whose class is
    "product-versions accordion" into the module-level set ``s``, then asks
    ``get_download`` to fetch the x64 Windows installer of every collected
    version into ``C:/workspace/<major version>``. Finally prints the set.
    """
    soup = BeautifulSoup(file, "html.parser")
    for anchor in soup.find_all('a', "product-versions accordion"):
        version = anchor.get("data-version")
        # Tag.get() returns None when the attribute is absent; a None (or
        # empty) value would break the string building below.
        if version:
            s.add(version)

    for version in s:
        # e.g. https://www.atlassian.com/software/confluence/downloads/binary/atlassian-confluence-7.19.1-x64.exe
        file_name = f"atlassian-confluence-{version}-x64.exe"
        download_url = (
            "https://www.atlassian.com/software/confluence/downloads/binary/"
            + file_name
        )
        # Installers are grouped in one folder per major version.
        download_dir = "C:/workspace/" + str(version).split('.')[0]
        get_download(download_url, file_name, download_dir)
    print(s)
# Run the scrape-and-download routine only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
728x90