01_fetch.py - Dissociated-Parse (master) - git @ Cat's Eye Technologies

Tree @master (Download .tar.gz)

generated
LICENSES
.gitignore
01_fetch.py
02_scrape.py
03_sentencify.py
04_parse.py
05_build.py
06_traverse.py
cleanup.py
linktree.py
README.md
requirements.txt

01_fetch.py @master — raw · history · blame

#!/usr/bin/env python3

# Copyright (c) 2021-2024 Chris Pressey, Cat's Eye Technologies
# This file is distributed under the MIT license.  For more information, see
# the file LicenseRef-MIT-X-Dissociated-Parse.txt in the LICENSES directory.
# SPDX-License-Identifier: LicenseRef-MIT-X-Dissociated-Parse

from time import sleep
import requests
import os

# One Wikisource URL per chapter of "The Wonderful Wizard of Oz",
# covering chapters 1 through 24 inclusive.
WWOZ = [
    f'https://en.wikisource.org/wiki/The_Wonderful_Wizard_of_Oz/Chapter_{chapter}'
    for chapter in range(1, 25)
]
# All URLs to fetch as a single newline-separated string.
PAGES = '\n'.join(WWOZ)


def main():
    """Download every page listed in PAGES into the ``download`` directory.

    Each URL is saved to a file named after the URL with ':' and '/'
    replaced by '_'.  URLs whose destination file already exists are
    skipped, so an interrupted run can simply be restarted.  The function
    sleeps for 10 seconds before each request.

    Raises:
        requests.RequestException: if a request fails, times out, or the
            server answers with an HTTP error status.
    """
    print('Start!')
    # open() does not create missing directories, so make sure the
    # destination directory exists before the first write.
    os.makedirs('download', exist_ok=True)
    for url in PAGES.split():
        dest = os.path.join('download', url.replace(':', '_').replace('/', '_'))
        print(url, '-->', dest)
        if os.path.exists(dest):
            print(dest, 'already exists, skipping')
            continue
        sleep(10)
        print('Fetching...')
        # A timeout keeps a stalled connection from hanging the run forever.
        r = requests.get(url, timeout=60)
        # Fail before writing anything: an error page saved to disk would be
        # mistaken for an already-downloaded page on every later run.
        r.raise_for_status()
        # Write UTF-8 explicitly; the platform default encoding may not be
        # able to represent every character in the page.
        with open(dest, 'w', encoding='utf-8') as f:
            f.write(r.text)

# Run the download only when executed as a script, not when imported.
if __name__ == '__main__':
    main()