Would you like to clone this notebook?

When you clone a notebook you are able to make changes without affecting the original notebook.

Cancel

Scrape Instagram Page's popular Images

node v6.17.1
version: 2.0.0
endpointsharetweet
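The code below scrapes popular images from an Instagram page. Example: for a given URL 'https://www.instagram.com/dress_blouse_designer/', one may call the function ScrapeInstagramPage ({username : "dress_blouse_designer", debug : false}); The function 'ScrapeInstagramPage' accounts for the post ageing effect: each post's like count is divided by a factor that grows logarithmically with the post's age in days, so older posts do not outrank recent ones merely because they have had longer to collect likes.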
// Dependencies. parse5 and request are loaded but not referenced directly
// below; they are kept so the notebook still resolves them. The parse5 binding
// has its own name so it is no longer overwritten by the request binding.
var parse5 = require('parse5');
var request = require('request');
var rp = require('request-promise');
var $ = require('cheerio'); // Basically jQuery for node.js
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

/**
 * Fetches an Instagram profile page, reads the profile's user record from the
 * page's `window._sharedData`, then pages through the profile's posts and
 * prints their image URLs to stdout, ordered from the highest to the lowest
 * age-discounted like score.
 *
 * All work is asynchronous; failures are logged with an "ERROR: " prefix and
 * are not thrown to the caller.
 *
 * @param {Object}  args
 * @param {string}  args.username  Profile name as it appears in the profile URL.
 * @param {boolean} [args.debug]   When truthy, logs intermediate data.
 * @returns {undefined} Nothing; results are written with console.log.
 */
function ScrapeInstagramPage (args) {
    // Base of the logarithm used to discount likes by post age.
    // Original author's note: values (1-10]; big value means no ageing.
    const AGEING = 10;
    const SECONDS_PER_DAY = 24 * 60 * 60;

    dout("ScrapeInstagramPage for username -> " + args.username);

    // Escape the name so characters such as '/', '?' or '#' cannot alter the URL.
    const query_url = 'https://www.instagram.com/' + encodeURIComponent(args.username) + '/';
    const cookieString = '';

    // Shared request options; GetPostsFromUser overwrites `url` for each page.
    const options = {
        url: query_url,
        method: 'GET',
        headers: {
            'x-requested-with' : 'XMLHttpRequest',
            'accept-language' : 'en-US,en;q=0.8,pt;q=0.6,hi;q=0.4',
            'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            // Refer to the profile actually being scraped rather than a fixed one.
            'referer' : query_url,
            'Cookie' : cookieString,
            'Accept' : '*/*',
            'Connection' : 'keep-alive',
            'authority' : 'www.instagram.com'
        },
        transform: autoParse
    };

    // Posts collected so far, across all pages.
    let pop_posts = [];

    /** Logs `msg` only when the caller asked for debug output. */
    function dout (msg) {
        if (args.debug) {
            console.log(msg);
        }
    }

    /**
     * request-promise `transform` hook: converts the raw body according to the
     * response's media type. JSON is parsed, HTML is loaded into cheerio and
     * anything else is returned untouched.
     *
     * Only the media type is compared, so header values carrying parameters
     * (for example "text/html; charset=utf-8") are still recognised.
     */
    function autoParse (body, response, resolveWithFullResponse) {
        const rawType = response.headers['content-type'] || '';
        const mediaType = rawType.split(';')[0].trim().toLowerCase();

        if (mediaType === 'application/json') {
            return JSON.parse(body);
        }
        if (mediaType === 'text/html') {
            return $.load(body);
        }
        return body;
    }

    /**
     * Builds the summary record for one timeline edge.
     *
     * The score stored in `likes_per_day` is the like count divided by
     * log(AGEING + age in days) / log(AGEING), so a post made today is divided
     * by 1 and older posts by progressively larger factors.
     */
    function scorePost (edge) {
        const now_seconds = Date.now() / 1000;
        const seconds_since_post = now_seconds - edge.node.taken_at_timestamp;
        // Clamp at 0: a timestamp in the future would otherwise give a negative
        // age and a NaN or negative discount factor.
        const days_since_post = Math.max(0, Math.floor(seconds_since_post / SECONDS_PER_DAY));
        const df = Math.log(AGEING + days_since_post) / Math.log(AGEING);
        const total_likes = edge.node.edge_liked_by.count;

        return {
            url: edge.node.display_url,
            likes_per_day: total_likes / df,
            days_since_post: days_since_post,
            total_likes: total_likes
        };
    }

    /** Sorts the collected posts by descending score and prints each URL. */
    function printRankedPosts () {
        pop_posts.sort(function (x, y) {
            if (y.likes_per_day < x.likes_per_day) return -1;
            if (y.likes_per_day > x.likes_per_day) return 1;
            return 0;
        });
        pop_posts.forEach(function (post) {
            console.log(post.url);
        });
    }

    /**
     * Requests one page of the user's timeline and recurses while the response
     * reports another page. The ranked list is printed once, after the last
     * page (or after a failure, so posts gathered up to that point are still
     * reported), instead of being re-printed cumulatively for every page.
     *
     * @param {string} user_id     Value of `user.id` from the profile page.
     * @param {number} first       Page size passed as the `first` query parameter.
     * @param {string} [end_cursor] Cursor from the previous page; omitted for the first page.
     */
    function GetPostsFromUser (user_id, first, end_cursor) {
        let end_cursor_str = "";
        if (end_cursor != null) {
            // The cursor is opaque text from the previous response; escape it.
            end_cursor_str = '&after=' + encodeURIComponent(end_cursor);
        }

        // NOTE(review): query_id is a fixed identifier copied from the original
        // script; presumably it selects the timeline-media query - confirm.
        options.url = 'https://www.instagram.com/graphql/query/?query_id=17880160963012870&id=' + user_id + '&first=' + first + end_cursor_str;

        rp(options)
            .then(function (autoParsedBody) {
                if (autoParsedBody.status !== "ok") {
                    console.log( "ERROR: Posts AJAX call not returned good..." );
                    printRankedPosts();
                    return;
                }

                dout(autoParsedBody.data);
                const posts = autoParsedBody.data.user.edge_owner_to_timeline_media;

                if (posts.edges.length > 0) {
                    pop_posts = pop_posts.concat(posts.edges.map(scorePost));
                }

                if (posts.page_info.has_next_page) {
                    GetPostsFromUser(user_id, first, posts.page_info.end_cursor);
                } else {
                    printRankedPosts();
                }
            })
            .catch(function (err) {
                console.log( "ERROR: " + err );
                printRankedPosts();
            });
    }

    rp(options)
        .then(function (autoParsedBody) {
            dout("Responce of 'Get first user page': ");
            dout(autoParsedBody);
            dout("Creating JSDOM from above Responce...");

            // Fail with a readable message when the profile request did not
            // come back as HTML (autoParse then returns something other than a
            // cheerio document).
            if (!autoParsedBody || typeof autoParsedBody.html !== 'function') {
                throw new Error("Profile page response was not HTML");
            }

            // NOTE(review): runScripts "dangerously" makes jsdom execute the
            // scripts embedded in the downloaded page, which is how
            // window._sharedData becomes available here. jsdom documents this
            // mode as unsafe for untrusted content - confirm this is acceptable
            // where the script runs.
            const dom = new JSDOM(autoParsedBody.html(), { runScripts: "dangerously" });

            dout(dom.window._sharedData); // full data doc form instagram for a page
            const user = dom.window._sharedData.entry_data.ProfilePage[0].user;

            if (args.debug) {
                console.log(user);                    // page user
                console.log(user.id);                 // user ID
                console.log(user.full_name);          // user full_name
                console.log(user.username);           // user username
                console.log(user.followed_by.count);  // user followed_by
                console.log(user.profile_pic_url_hd); // user profile pic
                console.log(autoParsedBody.html());
            }

            if (user.is_private) {
                dout ("User account is PRIVATE");
            } else {
                dout ("User account is public");
                GetPostsFromUser(user.id, 5000, undefined);
            }
        })
        .catch(function (err) {
            console.log( "ERROR: " + err );
        });
}

ScrapeInstagramPage ({username : "dress_blouse_designer", debug : false});
Loading…

no comments

    sign in to comment