From 0bdf6ac5ea2498640eff795838a7c3e8f3a8ac24 Mon Sep 17 00:00:00 2001
From: windlejacob12
Date: Sat, 3 Sep 2022 21:33:48 -0400
Subject: [PATCH] More link functions

---
 src/com_jakewindle_git/scraper.clj | 33 +++++++++++++++++-------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/com_jakewindle_git/scraper.clj b/src/com_jakewindle_git/scraper.clj
index 6728b5a..9279242 100644
--- a/src/com_jakewindle_git/scraper.clj
+++ b/src/com_jakewindle_git/scraper.clj
@@ -1,25 +1,30 @@
 (ns com-jakewindle-git.scraper
-  (:gen-class)
   (:require [clj-http.client :as client]
-            [hickory.core :as hick]))
+            [hickory.core :as hick]
+            [hickory.select :as s]))
 
 ;; Models
+(def stat-urls ["https://www.espn.com/nfl/stats/team/_/season/2011/seasontype/2"])
 
-;; Page
-(defn page [html]
-  {:html html :parsed nil})
-
-(defn get-src [uri]
-  (:body (client/get uri)))
-
-(defn parse-src [src]
-  (hick/parse src))
+(defn not-nil [v]
+  (not (nil? v)))
 
 (defn new-page [uri]
   (-> uri
-      (get-src)
-      (parse-src)
-      (page)))
+      (client/get)
+      :body
+      hick/parse
+      hick/as-hickory))
+
+(defn get-links [parsed]
+  (-> (s/select (s/child (s/tag "a"))
+                parsed)))
+
+(defn links-to-text [links]
+  (filter not-nil (map #(-> % :attrs :href) links)))
+
+(defn init-crawler []
+  (map #(new-page %) stat-urls))
 
 ;; Entities
 (defn greet