Lunch And Learn

node v0.12.18
version: 2.0.1
endpointsharetweet
This is about as basic a crawler as you can get. Axios is an HTTP request library that behaves like the WHATWG-standard Fetch API (https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API) but works on both the client and the server. Cheerio brings jQuery-like DOM manipulation to the server.
var axios = require('axios'); // Docs: https://github.com/mzabriskie/axios
var cheerio = require('cheerio'); // Docs: https://github.com/cheeriojs/cheerio

// The page to crawl
var ARTICLE_URL = 'http://www.buzzfeed.com/expresident/best-cat-pictures';

// The selectors you'll use
var ARTICLE_TITLE_SELECTOR = '#post-title';
var LIST_ITEM_SELECTOR = '.buzz_superlist_item_image';
var ITEM_TITLE_SELECTOR = '.subbuzz_name';
var ITEM_IMAGE_SELECTOR = 'img.bf_dom';
var ITEM_IMAGE_ATTRIBUTE = 'rel:bf_image_src';

/**
 * Request a page and return a Promise for a loaded Cheerio object.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise} Resolves with the result of `cheerio.load` applied to
 *   the response body. Rejects with the original error (after logging it)
 *   when the request or the load fails.
 */
function urlToCheerio (url) {
  return axios.get(url)
    .then(result => cheerio.load(result.data))
    .catch(e => {
      console.log('Crawl error', e);
      // Re-throw: swallowing the error here would resolve the Promise with
      // `undefined`, and the caller would then fail with an unrelated
      // "$ is not a function" TypeError instead of the real cause.
      throw e;
    });
}

/**
 * Pull the title and image URL out of a single list item.
 *
 * @param {Function} $ - Loaded Cheerio object for the page.
 * @param {Object} el - The list item element to read from.
 * @returns {{title: string, image: (string|undefined)}} Text of the item's
 *   title element and the value of its image attribute.
 */
function extractListItem ($, el) {
  var $item = $(el);

  return {
    title: $item.find(ITEM_TITLE_SELECTOR).text(),
    image: $item.find(ITEM_IMAGE_SELECTOR).attr(ITEM_IMAGE_ATTRIBUTE)
  };
}

/**
 * Turn a loaded article page into a plain data object.
 *
 * @param {Function} $ - Loaded Cheerio object for the page.
 * @returns {{title: string, slides: Array}} The article title plus one
 *   `extractListItem` result per element matching LIST_ITEM_SELECTOR.
 */
function processBuzzfeedPage ($) {
  return {
    title: $(ARTICLE_TITLE_SELECTOR).text(),
    slides: $(LIST_ITEM_SELECTOR)
      .map((i, el) => extractListItem($, el))
      .toArray()
  };
}

// `article` is read by the rendering code that follows this cell.
var article = processBuzzfeedPage(await urlToCheerio(ARTICLE_URL));
You should now have an `article` object whose `slides` array holds the title and image URL for each of the 100 cat photos. To see them all, we are going to turn them into React elements, which Tonic can helpfully display below.
var React = require('react'); // This adds HTML tags like <img> into the global scope so that you can make HTML // Turn the URLs into React elements var SlideshowImage = React.createClass({ displayName: 'SlideshowImage', getInitialState: function() { return { isHovering: false }; }, makeImageStyles: function () { return { width: 400, height: 400, margin: 5, display: 'inline-block', backgroundSize: 'cover', backgroundImage: 'url(' + this.props.image + ')', position: 'relative' }; }, render: function() { var titleStyles = { color: '#fff', position: 'absolute', textAlign: 'center', top: 5, textShadow: '0 0 3px rgba(0,0,0,0.3)', width: '100%' }; return ( <div style={this.makeImageStyles()} > <h3 style={titleStyles}>{this.props.title.split('.')[1]}</h3> </div> ); } }); // React forces you to always enclose arrays of HTML elements in a parent element. var BuzzfeedArticle = React.createClass({ displayName: 'Article', processItem: function (item) { return <SlideshowImage title={item.title} image={item.image} /> }, render: function() { return ( <div> <h1>{this.props.title}</h1> {this.props.items.map(this.processItem)} </div> ); } }); var finalArticle = <BuzzfeedArticle title={article.title} items={article.slides} />; React.renderToString(finalArticle);
Loading…

no comments

    sign in to comment