scraper.coffee

Jump To …

README src / data_urls.coffee src / downloader.coffee src / parser.coffee src / scraper.coffee

scraper.coffee src/
¶ External dependencies	whenjs = require 'when' unfoldList = require 'when/unfold/list' cheerio = require 'cheerio'
¶ Internal dependencies	DATA_FEED_URLS = require './data_urls' Parser = require './parser' Downloader = require './downloader'
¶ Pandata
# This is the main class of the module, which exposes the API. Use like so:
#
#     Pandata = require 'pandata'
#     Pandata.get 'john@example.com', (err, webname) ->
#       scraper = new Pandata webname
module.exports = class Pandata

  # Create a scraper for the Pandora user identified by `webname`. The webname
  # is substituted into every feed URL built by `_get_url`. Each instance
  # creates its own `Parser`.
  constructor: (@webname) ->
    @parser = new Parser

  # ### get
  #
  # Get a Pandora webname by searching for a string (such as an email
  # address), and pass it to a callback. The callback must have the signature
  # `(error, webname)`: on success `error` is `null`; on failure `webname` is
  # `null` and `error` is an `Error`.
  #
  # An exact match on `user_id` wins; otherwise the first search result is
  # used; if the search produced no results the callback receives the error.
  @get: (user_id, callback) ->
    # A replacer function inserts `user_id` literally; a plain string would
    # have `$&`, `$1`, ... interpreted as replacement patterns.
    search_url = DATA_FEED_URLS.user_search.replace('%{searchString}', -> user_id)
    Downloader.read_page search_url, (data) ->
      $ = cheerio.load data
      # Default to an empty list so a missing result falls through to the
      # error branch instead of throwing inside this callback.
      webnames = Parser.get_webnames_from_search($) ? []
      if user_id in webnames
        callback null, user_id
      else if webnames[0] isnt undefined
        callback null, webnames[0]
      else
        callback new Error("""[Pandata] Couldn't find a Pandora user with that email or webname."""), null

  # ## API

  # ### recent_activity
  #
  # Get a list of artists and tracks a user has listened to recently.
  #
  # Returns a promise for an array of tracks and artists. A track is an object
  # with `track` and `artist` fields. An artist is just a string.
  #
  #     [ {track: 'Promises', artist: 'The Range'}
  #     , 'The Range'
  #     , 'MGMT'
  #     , {track: 'Kids', artist: 'MGMT'}
  #     ]
  recent_activity: ->
    @_scrape_for('recent_activity', 'get_recent_activity')

  # ### playing_station
  #
  # Get the station a user is currently playing (this means the one that is
  # currently selected for playback in Pandora; the user may not actually be
  # on pandora.com listening to it).
  #
  # Returns a promise for the name of the currently playing station as a
  # string: `'The Range Radio'`
  playing_station: ->
    @_scrape_for('playing_station', 'get_playing_station')
      # `_scrape_for` fulfills with `undefined` after logging a failure, so
      # guard the index access instead of throwing a TypeError here.
      .then (result) -> result?[0]

  # ### stations
  #
  # Get the stations a user has listened to or created.
  #
  # Returns a promise for an array of station names:
  # `['The Range Radio', 'MGMT Radio']`
  stations: ->
    @_scrape_for('stations', 'get_stations')

  # ### bookmarks
  #
  # Get tracks and artists a user has bookmarked.
  #
  # Returns a promise for an object of `tracks` and `artists`:
  #
  #     { tracks: [ {track: 'Promises', artist: 'The Range'}
  #               , {track: 'Kids', artist: 'MGMT'}
  #               ]
  #     , artists: ['The Range', 'MGMT']
  #     }
  #
  # Also accepts an optional string argument to limit the result to a
  # particular category, either `'tracks'` or `'artists'`. For example,
  # `scraper.bookmarks('tracks')` returns a promise for an array of tracks:
  #
  #     [ {track: 'Promises', artist: 'The Range'}
  #     , {track: 'Kids', artist: 'MGMT'}
  #     ]
  #
  # Any other argument yields a rejected promise.
  bookmarks: (bookmark_type = 'all') ->
    switch bookmark_type
      when 'tracks'
        @_scrape_for('bookmarked_tracks', 'get_bookmarked_tracks')
      when 'artists'
        @_scrape_for('bookmarked_artists', 'get_bookmarked_artists')
      when 'all'
        # Wait for all the scraping promises to resolve before combining them
        # into the returned object
        whenjs.all([
          @bookmarks('artists')
          @bookmarks('tracks')
        ]).then(
          (results) ->
            artists: results[0]
            tracks: results[1]
          # Log the failure; the returned promise then fulfills with
          # `undefined`.
          (reason) ->
            console.error reason
        )
      else
        # Keep the "always returns a promise" contract for unknown categories
        # rather than handing the caller `undefined`.
        whenjs.reject new Error("[Pandata] Unknown bookmark type: #{bookmark_type}")

  # ### likes
  #
  # Get tracks, artists, stations, and albums a user has liked.
  #
  # Returns a promise for an object containing arrays of tracks, artists,
  # stations, and albums:
  #
  #     { tracks: [ {track: 'Promises', artist: 'The Range'}
  #               , {track: 'Kids', artist: 'MGMT'}
  #               ]
  #     , artists: ['The Range', 'MGMT']
  #     , stations: ['The Range Radio', 'MGMT Radio']
  #     , albums: [ {album: 'Nonfiction', artist: 'The Range' }
  #               , {album: 'Oracular Spectacular', artist: 'MGMT'}
  #               ]
  #     }
  #
  # Also accepts an optional string argument to limit the result to a
  # particular category, either `'tracks'`, `'artists'`, `'stations'`, or
  # `'albums'`. For example, `scraper.likes('tracks')` returns a promise for
  # an array of tracks:
  #
  #     [ {track: 'Promises', artist: 'The Range'}
  #     , {track: 'Kids', artist: 'MGMT'}
  #     ]
  #
  # Any other argument yields a rejected promise.
  likes: (like_type = 'all') ->
    switch like_type
      when 'tracks'
        @_scrape_for('liked_tracks', 'get_liked_tracks')
      when 'artists'
        @_scrape_for('liked_artists', 'get_liked_artists')
      when 'stations'
        @_scrape_for('liked_stations', 'get_liked_stations')
      when 'albums'
        @_scrape_for('liked_albums', 'get_liked_albums')
      when 'all'
        # Wait for all the scraping promises to resolve before combining them
        # into the returned object
        whenjs.all([
          @likes('artists')
          @likes('albums')
          @likes('stations')
          @likes('tracks')
        ]).then(
          (results) ->
            artists: results[0]
            albums: results[1]
            stations: results[2]
            tracks: results[3]
          # Log the failure; the returned promise then fulfills with
          # `undefined`.
          (reason) ->
            console.error reason
        )
      else
        # Keep the "always returns a promise" contract for unknown categories
        # rather than handing the caller `undefined`.
        whenjs.reject new Error("[Pandata] Unknown like type: #{like_type}")

  # ### following
  #
  # Get the Pandora users this user is following (scraped from the
  # `following` feed).
  #
  # Returns a promise for an array of user objects:
  #
  #     [ { name: 'Will Mayner'
  #       , webname: 'wmayner'
  #       , href: '/profile/wmayner'
  #       }
  #     ]
  following: ->
    @_scrape_for('following', 'get_following')

  # ### followers
  #
  # Get the Pandora users that follow this user (scraped from the `followers`
  # feed).
  #
  # Returns a promise for an array of user objects:
  #
  #     [ { name: 'Will Mayner'
  #       , webname: 'wmayner'
  #       , href: '/profile/wmayner'
  #       }
  #     ]
  followers: ->
    @_scrape_for('followers', 'get_followers')

  # ## Private methods

  # Downloads all data of a given type and calls the supplied `Parser` method
  # on each page. `data_type` is a key of `DATA_FEED_URLS`; `parser_method` is
  # the name of a method on this instance's `Parser`.
  #
  # Returns a promise for the flattened array of results. If any page fails,
  # the reason is logged with `console.error` and the promise fulfills with
  # `undefined`.
  _scrape_for: (data_type, parser_method) ->

    # This is called iteratively by `unfoldList` as long as `condition` is not
    # met
    unspool = (next_data_indices) =>
      # Must return a promise
      deferred = whenjs.defer()
      # We'll give the resolver to the `Downloader`
      resolver = deferred.resolver
      url = @_get_url data_type, next_data_indices
      Downloader.read_page url, (data) =>
        # An exception thrown in this asynchronous callback would otherwise
        # leave the deferred pending forever, so turn it into a rejection.
        try
          # Check if we're getting XML, use `xmlMode: true` with `cheerio` if
          # so
          $ = cheerio.load data, (if /\.xml/.test url then {xmlMode: yes} else {})
          # Pass the parsed DOM object to the `Parser` method
          result = @parser[parser_method]($)
          upcoming_indices = @parser.get_next_data_indices($)
        catch error
          return resolver.reject error
        # `result` is added to the list, `upcoming_indices` is passed to the
        # next `condition` and `unspool` calls
        return resolver.resolve([result, upcoming_indices])
      return deferred.promise

    # The condition upon which `unfoldList` will stop: a non-null, empty set
    # of indices. The null seed is falsy, so the first page is always fetched.
    # NOTE(review): a null/undefined value from `get_next_data_indices` is
    # also falsy here and would make `_get_url` start again from index 0 --
    # confirm the parser returns an empty object or array when no pages
    # remain.
    condition = (next_data_indices) =>
      if next_data_indices? then @_is_empty next_data_indices else next_data_indices

    # The initial seed for `unfoldList`
    initial_data_indices = null
    seed = initial_data_indices

    # `unfoldList` handles the sequential execution of an unknown number of
    # iterations of an asynchronous function, which is in this case grabbing
    # data from the Pandora feeds (they may or may not have more pages, which
    # is only known after fetching the current one), and returns a promise for
    # an array of the results.
    return unfoldList(unspool, condition, seed)
      .then(
        # Resolution
        (results) ->
          # Flatten the resulting array
          return [].concat results...
        # Rejection
        (reason) ->
          console.error reason
      )

  # Grab a URL from `DATA_FEED_URLS` and format it appropriately.
  #
  # `next_data_indices` maps placeholder names to values; when it is null the
  # three start indices default to 0. The object passed in is not modified.
  # Throws an `Error` if `data_type` is not a key of `DATA_FEED_URLS`.
  _get_url: (data_type, next_data_indices = null) ->
    unless next_data_indices?
      next_data_indices =
        nextStartIndex: 0
        nextLikeStartIndex: 0
        nextThumbStartIndex: 0
    # Work on a copy so the caller's object is left untouched
    url_params = {}
    for param_name, param_value of next_data_indices
      url_params[param_name] = param_value
    # We want to set the webname parameter as well
    url_params['webname'] = @webname
    # Grab the proper URL
    url = DATA_FEED_URLS[data_type]
    unless url?
      throw new Error("[Pandata] Unknown data type: #{data_type}")
    # Replace the parameters with values. Each placeholder is matched as a
    # literal string (first occurrence only), and the replacer function keeps
    # `$` sequences in a value from being treated as replacement patterns.
    for param_name, param_value of url_params
      do (param_value) ->
        url = url.replace("%{#{param_name}}", -> param_value)
    return url

  # Utility method to check if an object is empty or not.
  #
  # `null` and `undefined` count as empty. Anything with a `length` is judged
  # by that length; any other object is empty when it has no own enumerable
  # properties.
  _is_empty: (obj) ->
    return yes unless obj?
    if obj.length and obj.length > 0 then return no
    if obj.length is 0 then return yes
    for key of obj
      if Object::hasOwnProperty.call(obj, key) then return no
    return yes

generated Sun May 18 2014 23:57:49 GMT-0500 (CDT)

scraper.coffee

Pandata

get

API

recent_activity

playing_station

stations

bookmarks

likes

following

followers

Private methods