A A
[Data Mining] Getting Data Part.1

Getting Data

from collections import Counter
import math, random, csv, json, re

from bs4 import BeautifulSoup
import requests

 

์˜ˆ๋ฅผ ๋“ค์–ด, beautifulsoup ๊ฐ™์€, ์–ด๋–ค ๋ชจ๋“ˆ์ด ์„ค์น˜๋˜์ง€ ์•Š์•˜๋‹ค๋ฉด? ์–ด๋–ป๊ฒŒ ํ•ด์•ผ ํ• ๊นŒ์š”?

  • googling: ์•„๋‚˜์ฝ˜๋‹ค beautifulsoup ์„ค์น˜ ๋ฐฉ๋ฒ•
  • ๊ตฌ๊ธ€ ๋‹ต๋ณ€์—์„œ ์•„๋‚˜์ฝ˜๋‹ค ํด๋ผ์šฐ๋“œ๋ฅผ ์ฐพ์œผ์„ธ์š”. ๋ชจ๋“ˆ๋“ค์ด ํ…Œ์ŠคํŠธ๋˜๊ณ  ์•ˆ์ „ํ•œ ๊ณณ์ž…๋‹ˆ๋‹ค.
  • ์ด์ œ, ์—ฌ๋Ÿฌ๋ถ„์€ ๋ถ€๋„๋Ÿฌ์šด ์ •๋„๋กœ ๋งŽ์€ ์‹œ๊ฐ„์„ acquiring(ํš๋“), cleaning(์ •๋ฆฌ), and transforming data(๋ฐ์ดํ„ฐ ๋ณ€ํ™˜)์— ํ• ์• ํ•˜๊ฒŒ ๋  ๊ฒƒ์ž…๋‹ˆ๋‹ค.

stdin and stdout

Number of lines containing numbers

์ˆซ์ž๊ฐ€ ํฌํ•จ๋œ ํŒŒ์ผ์˜ ํ–‰ ์ˆ˜๋ฅผ ์…€ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
sys.stdin(Keyboard) ๋ฐ sys.stdout(Monitor)์„ ์‚ฌ์šฉํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ํŒŒ์ดํ”„ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
import sys, re

# egrep.py: echo to stdout every stdin line matching a regular expression.

# The regular expression is supplied as the first command-line argument
# (sys.argv[0] is the program name itself).
pattern = sys.argv[1]

# For each line piped into the script, write it back out if it matches.
for input_line in sys.stdin:
    if re.search(pattern, input_line):
        sys.stdout.write(input_line)
  • ๋ช…๋ น์ค„์—์„œ ์‚ฌ์šฉ์ž๊ฐ€ ์ง€์ •ํ•œ ์ •๊ทœ ํ‘œํ˜„์‹์„ ์ด์šฉํ•˜์—ฌ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›์€ ๊ฐ ์ค„์— ๋Œ€ํ•ด ๊ฒ€์ƒ‰ํ•˜๊ณ , ํ•ด๋‹น ์ •๊ทœ ํ‘œํ˜„์‹๊ณผ ์ผ์น˜ํ•˜๋Š” ๊ฒฝ์šฐ ํ•ด๋‹น ์ค„์„ ํ‘œ์ค€ ์ถœ๋ ฅ์— ์”๋‹ˆ๋‹ค.

 

import sys

# line_count.py: count the lines arriving on standard input.
count = sum(1 for _ in sys.stdin)

# print() writes to sys.stdout.
print(count)
0

 

# Windows

!type the_bible.txt | python egrep.py "[0-9]" | python line_count.py
/bin/bash: line 1: type: the_bible.txt: not found
python3: can't open file '/content/egrep.py': [Errno 2] No such file or directory
python3: can't open file '/content/line_count.py': [Errno 2] No such file or directory

Most Common Words

์ž…๋ ฅ๋œ ๋‹จ์–ด๋ฅผ ์„ธ์–ด ๊ฐ€์žฅ ์ผ๋ฐ˜์ ์ธ ๋‹จ์–ด๋ฅผ ์ž‘์„ฑํ•˜๋Š” ์Šคํฌ๋ฆฝํŠธ ์ž…๋‹ˆ๋‹ค.
import sys
from collections import Counter

# most_common_words.py: read text from stdin and print the num_words most
# frequent words, one per line, formatted as "<count>\t<word>".

# The number of words to report is the first command-line argument.
try:
    num_words = int(sys.argv[1])
except (IndexError, ValueError):  # argument missing or not an integer
    print("usage: most_common_words.py num_words")
    sys.exit(1)  # a nonzero exit code signals an error

# Count every whitespace-separated word on stdin, lowercased.
counter = Counter(word.lower()                        # lowercased word
                  for line in sys.stdin               # each line of stdin
                  for word in line.strip().split()    # split on whitespace
                  if word)                            # skip empty 'words'

# Print the most frequent words, tab-separated, one per line.
# FIX: the original wrote "\\t" and "\\n" -- the two-character literals
# backslash-t / backslash-n -- instead of a real tab and newline.
for word, count in counter.most_common(num_words):
    sys.stdout.write(str(count))   # the frequency
    sys.stdout.write("\t")         # a tab separator
    sys.stdout.write(word)         # the word itself
    sys.stdout.write("\n")         # end the line
# Windows

!type the_bible.txt | python most_common_words.py 10
64193	the
51380	and
34753	of
13643	to
12799	that
12560	in
10263	he
9840	shall
8987	unto
8836	for

Reading Files

The Basics of Text Files

# 'r'์€ ์ฝ๊ธฐ ์ „์šฉ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
file_for_reading = open('reading_file.txt', 'r')

# 'w'๋Š” ์“ฐ๊ธฐ๋ฅผ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค — ์ด๋ฏธ ํŒŒ์ผ์ด ์กด์žฌํ•˜๋ฉด ํŒŒ์ผ์„ ํŒŒ๊ดดํ•ฉ๋‹ˆ๋‹ค!
file_for_writing = open('writing_file.txt', 'w')

# 'a'๋Š” ์ถ”๊ฐ€๋ฅผ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค — ํŒŒ์ผ ๋์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.
file_for_appending = open('appending_file.txt', 'a')

# ํŒŒ์ผ์„ ์‚ฌ์šฉํ•œ ํ›„์—๋Š” ํŒŒ์ผ์„ ๊ผญ ๋‹ซ์•„์•ผ ํ•ฉ๋‹ˆ๋‹ค.
file_for_writing.close() # Why? Open file ๊ฐœ์ˆ˜๊ฐ€ ์ •ํ•ด์ ธ ์žˆ๊ธฐ ๋–ผ๋ฌธ
  • ํŒŒ์ผ์„ ๋‹ซ๋Š” ๊ฒƒ์„ ์žŠ๊ธฐ ์‰ฝ๊ธฐ ๋•Œ๋ฌธ์— ํ•ญ์ƒ ๋ธ”๋ก๊ณผ ํ•จ๊ป˜ ์‚ฌ์šฉํ•ด์•ผ ํ•˜๋ฉฐ, ๋ธ”๋ก์ด ๋๋‚˜๋ฉด ์ž๋™์œผ๋กœ ๋‹ซํž™๋‹ˆ๋‹ค.
# Sketch (pseudocode): `with` closes the file automatically when the
# block ends, even if an exception is raised inside it.
with open(filename, 'r') as f:
    # Call a function that pulls the data out of 'f'.
    data = function_that_gets_data_from(f)

# At this point 'f' is already closed, so don't try to use it.
# Process the data that was read.
process(data)
starts_with_hash = 0

# Open 'input.txt' for reading and count the lines that begin with '#',
# using a regular expression anchored at the start of the line.
with open('input.txt', 'r') as f:
    starts_with_hash = sum(1 for line in f if re.match("^#", line))

def get_domain(email_address):
    """Lowercase the address and return everything after the last '@'.

    If the address contains no '@' at all, the whole lowercased string
    is returned (matching str.split("@")[-1] behavior).
    """
    _, _, domain = email_address.lower().rpartition("@")
    return domain or email_address.lower()

# Open 'email_addresses.txt' and tally the domain of each address.
# Only lines that actually contain an '@' are counted.
with open('email_addresses.txt', 'r') as f:
    domain_counts = Counter()
    for line in f:
        if "@" in line:
            domain_counts[get_domain(line.strip())] += 1

Delimited Files

csv ํŒŒ์ผ : ์ด๋Ÿฌํ•œ ํŒŒ์ผ์€ ์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„๋˜๊ฑฐ๋‚˜ ํƒญ์œผ๋กœ ๊ตฌ๋ถ„๋˜๋Š” ๊ฒฝ์šฐ๊ฐ€ ๋งŽ์Šต๋‹ˆ๋‹ค.
!type tab_delimited_stock_prices.txt
6/20/2014	AAPL	90.91
6/20/2014	MSFT	41.68
6/20/2014	FB	64.5
6/19/2014	AAPL	91.86
6/19/2014	MSFT	41.51
6/19/2014	FB	64.34
  • csv.reader๋Š” ์ค„ ๋‹จ์œ„ ํŠœํ”Œ ์ƒ์„ฑ๊ธฐ์ž…๋‹ˆ๋‹ค.

 

  • ํƒญ์œผ๋กœ ๊ตฌ๋ถ„๋œ ํ…์ŠคํŠธ ํŒŒ์ผ์„ ์ฝ์–ด๋“ค์—ฌ์„œ ๊ฐ ํ–‰์˜ ๋‚ ์งœ, ๊ธฐํ˜ธ ๋ฐ ์ข…๊ฐ€๋ฅผ ์ถ”์ถœํ•˜๊ณ  ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.
  • CSV ๋ชจ๋“ˆ์˜ csv.reader() ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํŒŒ์ผ์„ ํƒญ์œผ๋กœ ๊ตฌ๋ถ„ํ•˜์—ฌ ์ฝ์Šต๋‹ˆ๋‹ค.
import csv

# Read tab-delimited stock prices: each row holds date, symbol, closing price.
# newline='' lets the csv module manage line endings itself, as the csv
# module documentation recommends.
with open('tab_delimited_stock_prices.txt', 'r', newline='') as f:
    # A reader that splits each row on tabs.
    # FIX: the delimiter must be the single tab character '\t'; the
    # original '\\t' is a two-character string, which csv rejects.
    reader = csv.reader(f, delimiter='\t')

    # Iterate over every row of the file.
    for row in reader:
        date = row[0]                    # first column: the date
        symbol = row[1]                  # second column: the ticker symbol
        closing_price = float(row[2])    # third column: closing price as float

        # Report the parsed values.
        print(date, symbol, closing_price)
6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34
%%bash
cat colon_delimited_stock_prices.txt
date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5

 

 

  • ์ฝœ๋ก ์œผ๋กœ ๊ตฌ๋ถ„๋œ ํ…์ŠคํŠธ ํŒŒ์ผ์„ ์ฝ์–ด๋“ค์—ฌ์„œ ๊ฐ ํ–‰์˜ ๋‚ ์งœ, ๊ธฐํ˜ธ ๋ฐ ์ข…๊ฐ€๋ฅผ ์ถ”์ถœํ•˜๊ณ  ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.
  • CSV ๋ชจ๋“ˆ์˜ csv.DictReader() ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํŒŒ์ผ์„ ์ฝ๊ณ , ๊ฐ ํ–‰์„ ๋”•์…”๋„ˆ๋ฆฌ๋กœ ๋‚˜ํƒ€๋ƒ…๋‹ˆ๋‹ค.
import csv

# Read colon-delimited stock prices. DictReader uses the header row
# ("date:symbol:closing_price") to map field names to each row's values.
with open('colon_delimited_stock_prices.txt', 'r') as f:
    reader = csv.DictReader(f, delimiter=':')

    # Each row behaves like a dictionary keyed by the header fields.
    for row in reader:
        # Pull out the fields, converting the price to a float,
        # and report them.
        print(row["date"], row["symbol"], float(row["closing_price"]))
6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
%%bash
cat comma_delimited_stock_prices.txt
FB,64.5
MSFT,41.68
AAPL,90.91

 

  • ์˜ค๋Š˜์˜ ์ฃผ๊ฐ€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ์ฝค๋งˆ๋กœ ๊ตฌ๋ถ„๋œ ํ…์ŠคํŠธ ํŒŒ์ผ์— ์“ฐ๋Š” ์˜ˆ์ œ์ž…๋‹ˆ๋‹ค.
  • CSV ๋ชจ๋“ˆ์˜ csv.writer() ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํŒŒ์ผ์„ ์“ฐ๊ณ , ๊ฐ ์ฃผ์‹๊ณผ ๊ฐ€๊ฒฉ์„ ๋ฆฌ์ŠคํŠธ๋กœ ๋งŒ๋“ค์–ด์„œ ํŒŒ์ผ์— ์”๋‹ˆ๋‹ค.
import csv

# Today's closing price for each stock symbol.
today_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

# Write the prices to a comma-delimited file.
# FIX: open csv output files with newline='' so that the csv module's
# '\r\n' row terminator is not translated again on Windows (which would
# produce '\r\r\n'), as the csv module documentation requires.
with open('comma_delimited_stock_prices_1.txt', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')

    # Write one [symbol, price] row per dictionary entry.
    for stock, price in today_prices.items():
        writer.writerow([stock, price])
%%bash
cat comma_delimited_stock_prices_1.txt
AAPL,90.91
MSFT,41.68
FB,64.5

 

results = [["test1", "success", "Monday"],
           ["test2", "success, kind of", "Tuesday"],
           ["test3", "failure, kind of", "Wednesday"],
           ["test4", "failure, utter", "Thursday"]]

# don't do this! Hand-joining fields breaks as soon as a field itself
# contains a comma (or a newline) -- use csv.writer instead.
# FIX: the original wrote "\\n" (backslash + n) instead of a newline.
with open('bad_csv.txt', 'w') as f:
    for row in results:
        f.write(",".join(map(str, row))) # might have too many commas in it!
        f.write("\n")                    # row might have newlines as well!
%%bash
cat bad_csv.txt
test1,success,Monday
test2,success, kind of,Tuesday
test3,failure, kind of,Wednesday
test4,failure, utter,Thursday

Scraping the Web

๋ฐ์ดํ„ฐ๋ฅผ ์–ป๋Š” ๋˜ ๋‹ค๋ฅธ ๋ฐฉ๋ฒ•์€ ์›น ํŽ˜์ด์ง€์—์„œ ์Šคํฌ๋žฉํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค

 

HTML & Parsing

<html>
  <head>
    <title>A web page</title>
  </head>
  <body>
    <p id="author">Joel Grus</p>
    <p id="subject">Data Science</p>
  </body>
</html>
  • for python3:
!pip install html5lib
!pip install beautifulsoup4
  • for anaconda
    •  
conda install -c anaconda html5lib
conda install -c anaconda beautifulsoup4

 

 

Html5Lib :: Anaconda.org

Description html5lib is a pure-python library for parsing HTML. It is designed to conform to the WHATWG HTML specification, as is implemented by all major web browsers.

anaconda.org

 

 

Login :: Anaconda.org

Sign in to Anaconda.org I forgot my password. I forgot my username. Register for an account.

anaconda.org


DOM Lesson One

  • Hello World! - John | Doe | Alaska
from bs4 import BeautifulSoup

# A small HTML document to experiment with: a title plus three
# paragraphs carrying id/class attributes.
html = """
<html>
  <head>
    <title>A web page</title>
  </head>
  <body>
    <p id="author">Joel Grus</p>
    <p id="subject">Data Science</p>
    <p class="price">30</p>
  </body>
</html>"""
# Parse the raw HTML into a navigable tag tree using the html5lib parser.
soup = BeautifulSoup(html, 'html5lib')
์ค‘์š”: Tag, Attribute, Text
soup = BeautifulSoup(html, 'html5lib')→ Parsing ํ•ด์„œ ์ž๋ฃŒ๊ตฌ์กฐํ™” ํ•ฉ๋‹ˆ๋‹ค.(Dict ํ™”)

 

Query 1: Find title (์ œ๋ชฉ ์ฐพ๊ธฐ)

soup.title.text

# 'A web page'

 

Query 2: Find title's text (์ œ๋ชฉ ํ…์ŠคํŠธ ์ฐพ๊ธฐ)

soup.title.text

# 'A web page'

 

Query 3: Find p of body (p์˜ ๋ณธ๋ฌธ ์ฐพ๊ธฐ)

soup.body.p

# <p id="author">Joel Grus</p>

 

Query 4: Find all p under body (p์•„๋ž˜ ๋ณธ๋ฌธ ์ฐพ๊ธฐ)

soup.body('p')
[<p id="author">Joel Grus</p>,
 <p id="subject">Data Science</p>,
 <p class="price">30</p>]

 

Query 5: Find second p's text of body (๋‘ ๋ฒˆ์งธ p์˜ ๋ณธ๋ฌธ ์ฐพ๊ธฐ)

soup.body('p')[1].text

# 'Data Science'

 

Query 6: Find last p of body (p์˜ ๋งˆ์ง€๋ง‰ ๋ถ€๋ถ„ ์ฐพ๊ธฐ)

soup.body('p')[-1]

# <p class="price">30</p>

 

Query 7: Loop over all p of body (p์˜ ๋ชจ๋“  ๋ถ€๋ถ„์— ๋ฃจํ”„๋ฅผ ์”Œ์šฐ๊ธฐ)

# Print every <p> under <body> with its position index.
for index, paragraph in enumerate(soup.body('p')):
    print('paragraph {}: {}'.format(index, paragraph.text))
paragraph 0: Joel Grus
paragraph 1: Data Science
paragraph 2: 30

 

Query 8: Find first p's id attribute's value (์ฒซ ๋ฒˆ์งธ p์˜ ID ์†์„ฑ ๊ฐ’ ์ฐพ๊ธฐ)

soup.p['id']

# 'author'

 

Query 9: Find all p whose attribute id is 'author' (์†์„ฑ ID๊ฐ€ '์ €์ž'์ธ ๋ชจ๋“  p ์ฐพ๊ธฐ)

soup('p', {'id':'author'})

# [<p id="author">Joel Grus</p>]

 

Query 10: Find all p whose attribute class is 'price' (์†์„ฑ ํด๋ž˜์Šค๊ฐ€ '๊ฐ€๊ฒฉ'์ธ ๋ชจ๋“  p ์ฐพ๊ธฐ)

soup('p', 'price')
#soup('p', {'class':'price'})
[<p class="price">30</p>]

 

Query 11: Find all texts (๋ชจ๋“  ํ…์ŠคํŠธ ์ฐพ๊ธฐ)

soup.text # a single string containing all of the document's text

# '\n    A web page\n  \n  \n    Joel Grus\n    Data Science\n    30\n  \n'

 

first_paragraph = soup.find('p')     # or simply soup.p
print(first_paragraph)               # prints the first <p> tag's content
print(type(first_paragraph))         # prints the type of the result (bs4 Tag)
<p id="author">Joel Grus</p>
<class 'bs4.element.Tag'>
  • ์ฒซ ๋ฒˆ์งธ <p> ํƒœ๊ทธ์˜ ๋‚ด์šฉ์ด ๋“ค์–ด ์žˆ์„ ๊ฒƒ์ด๋ฉฐ, <p> ํƒœ๊ทธ ๋‚ด์šฉ์˜ ํƒ€์ž…์€ BeautifulSoup์˜ ํŠน์ˆ˜ํ•œ ํƒ€์ž…์ธ Tag ์ž…๋‹ˆ๋‹ค.
first_paragraph_text = soup.p.text
first_paragraph_text

# 'Joel Grus'
first_paragraph_words = soup.p.text.split()
first_paragraph_words

# ['Joel', 'Grus']
  • ์ฒซ ๋ฒˆ์งธ <p> ํƒœ๊ทธ์˜ ํ…์ŠคํŠธ ๋‚ด์šฉ์„ ์ถ”์ถœํ•œ ํ›„, .split() ๋ฉ”์„œ๋“œ๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๊ณต๋ฐฑ์„ ๊ธฐ์ค€์œผ๋กœ ๋‹จ์–ด ๋‹จ์œ„๋กœ ๋ถ„ํ• ํ•ฉ๋‹ˆ๋‹ค.
first_paragraph_id = soup.p['id']        # 'id' ์†์„ฑ์ด ์—†์œผ๋ฉด KeyError๋ฅผ ๋ฐœ์ƒ์‹œํ‚ต๋‹ˆ๋‹ค.
first_paragraph_id
#type(soup.p)

# Result: 'author'
  • soup์—์„œ ์ฒซ ๋ฒˆ์งธ <p> ํƒœ๊ทธ์˜ id ์†์„ฑ ๊ฐ’์„ ๊ฐ€์ ธ์™€์„œ first_paragraph_id ๋ณ€์ˆ˜์— ํ• ๋‹นํ•ฉ๋‹ˆ๋‹ค.
  • ๋งŒ์•ฝ ํ•ด๋‹น ํƒœ๊ทธ์— id ์†์„ฑ์ด ์—†๋‹ค๋ฉด KeyError๊ฐ€ ๋ฐœ์ƒํ•ฉ๋‹ˆ๋‹ค.
first_paragraph_id2 = soup.p.get('id')   # 'id' ์†์„ฑ์ด ์—†์œผ๋ฉด None์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
print(first_paragraph_id2)

# Result: 'author'
  • soup์—์„œ ์ฒซ ๋ฒˆ์งธ <p> ํƒœ๊ทธ์˜ id ์†์„ฑ ๊ฐ’์„ ๊ฐ€์ ธ์™€์„œ first_paragraph_id2 ๋ณ€์ˆ˜์— ํ• ๋‹นํ•ฉ๋‹ˆ๋‹ค.
  • ๋งŒ์•ฝ ํ•ด๋‹น ํƒœ๊ทธ์— id ์†์„ฑ์ด ์—†๋‹ค๋ฉด None์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
all_paragraphs = soup.find_all('p')      # ๋˜๋Š” soup('p')๋กœ๋„ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค.
print(all_paragraphs)
[<p id="author">Joel Grus</p>, <p id="subject">Data Science</p>]
  • soup์—์„œ ๋ชจ๋“  <p> ํƒœ๊ทธ๋ฅผ ์ฐพ์•„์„œ all_paragraphs ๋ณ€์ˆ˜์— ํ• ๋‹นํ•ฉ๋‹ˆ๋‹ค.
  • ๊ทธ ๊ฒฐ๊ณผ๋Š” ๋ฆฌ์ŠคํŠธ ํ˜•ํƒœ๋กœ ๋ฐ˜ํ™˜๋ฉ๋‹ˆ๋‹ค.
soup('p')

# [<p id="author">Joel Grus</p>, <p id="subject">Data Science</p>]
soup('p', {'id':'subject'})

# [<p id="subject">Data Science</p>]
  • id ์†์„ฑ์ด 'subject'์ธ ๋ชจ๋“  <p> ํƒœ๊ทธ๋ฅผ ์ฐพ๋Š” ์˜ˆ์‹œ์ž…๋‹ˆ๋‹ค.
  • ๊ฒฐ๊ณผ๋Š” ํ•ด๋‹น ์กฐ๊ฑด์— ๋งž๋Š” <p> ํƒœ๊ทธ๋“ค์„ ํฌํ•จํ•˜๋Š” ๋ฆฌ์ŠคํŠธ๋กœ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
  • soup์—์„œ id ์†์„ฑ์ด 'subject' ์ธ ๋ชจ๋“  <p> ํƒœ๊ทธ๋ฅผ ์ฐพ์•„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
  • ๋‘ ๋ฒˆ์งธ ์ธ์ž๋กœ๋Š” ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์›ํ•˜๋Š” ์†์„ฑ๊ณผ ๊ทธ ๊ฐ’์˜ ์กฐ๊ฑด์„ ์ง€์ •ํ•ฉ๋‹ˆ๋‹ค.

 

paragraphs_with_ids = [p for p in soup('p') if p.get('id')]
paragraphs_with_ids
[<p id="author">Joel Grus</p>, <p id="subject">Data Science</p>]

 

  • BeautifulSoup ๊ฐ์ฒด soup์—์„œ ๋ชจ๋“  <p> ํƒœ๊ทธ๋ฅผ ์ฐพ์€ ํ›„, ๋ฆฌ์ŠคํŠธ ์ปดํ”„๋ฆฌํ—จ์…˜์„ ์‚ฌ์šฉํ•˜์—ฌ id ์†์„ฑ์„ ๊ฐ€์ง„ ํƒœ๊ทธ๋“ค๋งŒ ๋ชจ์•„์„œ paragraphs_with_ids ๋ฆฌ์ŠคํŠธ์— ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
  • ๊ฒฐ๊ณผ๋Š” id ์†์„ฑ์„ ๊ฐ€์ง„ <p> ํƒœ๊ทธ๋“ค์˜ ๋ฆฌ์ŠคํŠธ๋กœ ๋ฐ˜ํ™˜๋ฉ๋‹ˆ๋‹ค.
important_paragraphs = soup('p', {'class': 'important'})

# []  (๋นˆ ๋ฆฌ์ŠคํŠธ — ์ด ๋ฌธ์„œ์—๋Š” class๊ฐ€ 'important'์ธ <p>๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค)
  • soup์—์„œ ํด๋ž˜์Šค๊ฐ€ 'important'์ธ ๋ชจ๋“  <p> ํƒœ๊ทธ๋ฅผ ์ฐพ์•„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
  • ๋‘ ๋ฒˆ์งธ ์ธ์ž๋กœ๋Š” ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์›ํ•˜๋Š” ํด๋ž˜์Šค์™€ ๊ทธ ๊ฐ’์˜ ์กฐ๊ฑด์„ ์ง€์ •ํ•ฉ๋‹ˆ๋‹ค.

 

# ๋„ค์ด๋ฒ„ ํ™ˆํŽ˜์ด์ง€์˜ HTML์„ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค.
html = requests.get("<http://www.naver.com>").text

# HTML์„ BeautifulSoup์œผ๋กœ ํŒŒ์‹ฑํ•ฉ๋‹ˆ๋‹ค.
soup = BeautifulSoup(html, 'html5lib')
  • requests ๋ชจ๋“ˆ์„ ์‚ฌ์šฉํ•˜์—ฌ ๋„ค์ด๋ฒ„ ํ™ˆํŽ˜์ด์ง€์— GET ์š”์ฒญ์„ ๋ณด๋‚ด๊ณ , ํ•ด๋‹น ํŽ˜์ด์ง€์˜ HTML์„ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค.
  • ๊ทธ ํ›„, BeautifulSoup์„ ์‚ฌ์šฉํ•˜์—ฌ HTML์„ ํŒŒ์‹ฑํ•˜์—ฌ ๊ฐ์ฒด๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
  • ์ด๋ ‡๊ฒŒ ํ•˜๋ฉด ์›น ํŽ˜์ด์ง€์˜ ๊ตฌ์กฐ๋ฅผ ํƒ์ƒ‰ํ•˜๊ณ  ์›ํ•˜๋Š” ์ •๋ณด๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.

 

# WARNING: if a <span> sits inside several nested <div>s, it will be
# collected once per enclosing <div>; deduplicate if that matters.

spans_inside_divs = []
for div in soup('div'):              # for each <div> on the page,
    for span in div('span'):         # find every <span> inside it
        spans_inside_divs.append(span)
  • soup์—์„œ ๋ชจ๋“  <div> ํƒœ๊ทธ๋ฅผ ์ฐพ์€ ํ›„, ๊ฐ <div> ํƒœ๊ทธ ์•ˆ์— ์žˆ๋Š” ๋ชจ๋“  <span> ํƒœ๊ทธ๋ฅผ ์ฐพ์•„ ๋ฆฌ์ŠคํŠธ์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.
  • ๊ทธ๋Ÿฌ๋‚˜ ๋งŒ์•ฝ <span> ํƒœ๊ทธ๊ฐ€ ์—ฌ๋Ÿฌ ๊ฐœ์˜ <div> ํƒœ๊ทธ ์•ˆ์— ์กด์žฌํ•˜๋Š” ๊ฒฝ์šฐ, ๋™์ผํ•œ <span>์„ ์—ฌ๋Ÿฌ ๋ฒˆ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
spans_inside_divs
  • spans_inside_divs ๋ณ€์ˆ˜์—๋Š” ๋ชจ๋“  <div> ํƒœ๊ทธ ์•ˆ์— ์žˆ๋Š” ๋ชจ๋“  <span> ํƒœ๊ทธ๊ฐ€ ํฌํ•จ๋œ ๋ฆฌ์ŠคํŠธ๊ฐ€ ์ €์žฅ๋˜์–ด ์žˆ์„ ๊ฒƒ์ž…๋‹ˆ๋‹ค.
  • ์ด ๋ฆฌ์ŠคํŠธ๋Š” ๊ฐ <div> ํƒœ๊ทธ์— ๋Œ€ํ•ด ๊ทธ ์•ˆ์— ์žˆ๋Š” ๋ชจ๋“  <span> ํƒœ๊ทธ๋ฅผ ํฌํ•จํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.
  • ์ด ๋ฆฌ์ŠคํŠธ๋ฅผ ์ถœ๋ ฅํ•˜๋ฉด ํ•ด๋‹น ์ •๋ณด๋ฅผ ํ™•์ธํ•ฉ๋‹ˆ๋‹ค.