-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.js
More file actions
98 lines (80 loc) · 2.5 KB
/
crawl.js
File metadata and controls
98 lines (80 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import { JSDOM } from 'jsdom'
/**
 * Normalize a URL to a comparable `host/path` string so that variants
 * of the same page (trailing slash, different scheme) map to one key.
 * @param {string} url - an absolute URL; throws TypeError if unparsable
 * @returns {string} host + pathname, with any trailing slash removed
 */
function normalizeURL(url) {
  const { host, pathname } = new URL(url)
  const combined = `${host}${pathname}`
  // strip a trailing slash so "/path/" and "/path" compare equal
  return combined.endsWith('/') ? combined.slice(0, -1) : combined
}
/**
 * Extract every link target from an HTML document, resolving relative
 * hrefs against the given base URL.
 * @param {string} html - raw HTML markup to parse
 * @param {string} baseURL - base used to absolutize relative hrefs
 * @returns {string[]} absolute URL strings; unparsable hrefs are
 *   logged and skipped rather than thrown
 */
function getURLsFromHTML(html, baseURL) {
  const found = []
  const { document } = new JSDOM(html).window
  // a[href] matches only anchors that actually carry an href attribute
  for (const link of document.querySelectorAll('a[href]')) {
    const href = link.getAttribute('href')
    try {
      // convert any relative URLs to absolute URLs
      found.push(new URL(href, baseURL).href)
    } catch(err) {
      console.log(`${err.message}: ${href}`)
    }
  }
  return found
}
/**
 * Fetch a page and return its body text, enforcing that the response
 * is a successful HTML document.
 * @param {string} url - the page to download
 * @returns {Promise<string>} the response body
 * @throws {Error} on network failure, HTTP status > 399, or a
 *   non-text/html content type
 */
async function fetchHTML(url) {
  let response
  try {
    response = await fetch(url)
  } catch (err) {
    throw new Error(`Got Network error: ${err.message}`)
  }
  // 4xx/5xx are errors; redirects are followed by fetch automatically
  if (response.status > 399) {
    throw new Error(`Got HTTP error: ${response.status} ${response.statusText}`)
  }
  const contentType = response.headers.get('content-type')
  // bail on missing header or non-HTML payloads (images, JSON, etc.)
  if (!contentType || !contentType.includes('text/html')) {
    throw new Error(`Got non-HTML response: ${contentType}`)
  }
  return response.text()
}
/**
 * Recursively crawl all pages on the same host as baseURL, counting
 * how many times each normalized URL is linked to.
 * Call with just baseURL — the defaults prime the first invocation.
 * @param {string} baseURL - site root; crawling never leaves its host
 * @param {string} [currentURL=baseURL] - page being visited this call
 * @param {Object<string,number>} [pages={}] - normalized URL -> count
 * @returns {Promise<Object<string,number>>} the completed pages map
 */
async function crawlPage(baseURL, currentURL = baseURL, pages = {}) {
  // never follow links off-site: compare hostnames, not full URLs
  const sameHost =
    new URL(currentURL).hostname === new URL(baseURL).hostname
  if (!sameHost) {
    return pages
  }
  // use a consistent URL format as the map key
  const normalizedURL = normalizeURL(currentURL)
  // already visited: bump the count, skip the repeat HTTP request
  if (pages[normalizedURL] > 0) {
    pages[normalizedURL]++
    return pages
  }
  // first visit: record it before fetching
  pages[normalizedURL] = 1
  console.log(`crawling ${currentURL}`)
  let html = ''
  try {
    html = await fetchHTML(currentURL)
  } catch (err) {
    // fetch problems are logged, not fatal — keep crawling elsewhere
    console.log(`${err.message}`)
    return pages
  }
  // depth-first recursion through every link on this page
  for (const nextURL of getURLsFromHTML(html, baseURL)) {
    pages = await crawlPage(baseURL, nextURL, pages)
  }
  return pages
}
export { normalizeURL, getURLsFromHTML, crawlPage }