View Source SpiderMan behaviour (spider_man v0.6.3)

SpiderMan, a fast high-level web crawling & scraping framework for Elixir.

Components

Each Spider has 3 components, and each component has its own job:

Message flow: Downloader -> Spider -> ItemProcessor.

Spider Life Cycle

  1. Spider.settings()
  2. Prepare For Start Stage
    1. Spider.prepare_for_start(:pre, state)
    2. Spider.prepare_for_start_component(:downloader, state)
    3. Spider.prepare_for_start_component(:spider, state)
    4. Spider.prepare_for_start_component(:item_processor, state)
    5. Spider.prepare_for_start(:post, state)
  3. Spider.init(state)
  4. Spider.handle_response(response, context)
  5. Prepare For Stop Stage
    1. Spider.prepare_for_stop_component(:downloader, state)
    2. Spider.prepare_for_stop_component(:spider, state)
    3. Spider.prepare_for_stop_component(:item_processor, state)
    4. Spider.prepare_for_stop(state)

Summary

Types

  • :print_stats (boolean/0) - Print the stats of the spider. The default value is true.

Functions

fetch statistics for all of the spider's ETS tables

fetch spider's state

insert a request to spider

insert multiple requests to spider

list the spiders that have already been started

fetch spider's statistics

fetch component's statistics

fetch spider's status

stop a spider

Types

component()

@type component() :: :downloader | :spider | :item_processor

ets_stats()

@type ets_stats() :: [size: pos_integer(), memory: pos_integer()] | nil

prepare_for_start_stage()

@type prepare_for_start_stage() :: :pre | :post

request()

@type request() :: SpiderMan.Request.t()

requests()

@type requests() :: [request()]

settings()

@type settings() :: keyword()
  • :print_stats (boolean/0) - Print the stats of the spider. The default value is true.

  • :log2file - Save the log to files. The default value is true.

  • :status - Set the startup status for the spider. The default value is :running.

  • :spider (atom/0) - Set the callback module for the spider.

  • :spider_module (atom/0) - Set the callback module for the spider.

  • :callbacks (keyword/0)

  • :ets_file (String.t/0) - Set the filename for the spider, and load the spider's state from ETS files.

  • :downloader_options (keyword/0) - see Downloader Options.

  • :spider_options (keyword/0) - see Spider Options.

  • :item_processor_options (keyword/0) - see ItemProcessor Options.

Downloader options

  • :requester - The default value is {SpiderMan.Requester.Finch, []}.

  • :producer - The default value is SpiderMan.Producer.ETS.

  • :context (term/0) - The default value is %{}.

  • :processor (keyword/0) - See Processors Options. The default value is [max_demand: 1].

  • :rate_limiting - See Producers Options - rate_limiting. The default value is [allowed_messages: 10, interval: 1000].

  • :pipelines - Each message will be handled by each pipeline in turn. The default value is [SpiderMan.Pipeline.DuplicateFilter].

  • :post_pipelines - Each message will be handled by each pipeline in turn. The default value is [SpiderMan.Pipeline.DuplicateFilter].

Spider options

Batchers options

ItemProcessor options

spider()

@type spider() :: module() | atom()

status()

@type status() :: :running | :suspended

Callbacks

handle_response(t, context)

@callback handle_response(SpiderMan.Response.t(), context :: map()) :: %{
  optional(:requests) => [SpiderMan.Request.t()],
  optional(:items) => [SpiderMan.Item.t()]
}

init(state)

(optional)
@callback init(state) :: state when state: SpiderMan.Engine.state()

prepare_for_start(prepare_for_start_stage, state)

(optional)
@callback prepare_for_start(prepare_for_start_stage(), state) :: state
when state: SpiderMan.Engine.state()

prepare_for_start_component(component, arg2)

(optional)
@callback prepare_for_start_component(component(), options | false) :: options
when options: keyword()

prepare_for_stop(state)

(optional)
@callback prepare_for_stop(SpiderMan.Engine.state()) :: :ok

prepare_for_stop_component(component, options)

(optional)
@callback prepare_for_stop_component(component(), options :: keyword() | false) :: :ok

settings()

(optional)
@callback settings() :: settings()

Functions

check_zero_task?(spider)

components()

@spec components() :: [component()]

continue(spider, timeout \\ :infinity)

@spec continue(spider(), timeout()) :: :ok

continue a spider

ets_stats(spider)

@spec ets_stats(spider()) :: [
  common_pipeline_tid: ets_stats(),
  downloader_tid: ets_stats(),
  failed_tid: ets_stats(),
  spider_tid: ets_stats(),
  item_processor_tid: ets_stats()
]

fetch statistics for all of the spider's ETS tables

get_state(spider)

@spec get_state(spider()) :: SpiderMan.Engine.state()

fetch spider's state

insert_request(spider, request)

@spec insert_request(spider(), request()) :: true | nil

insert a request to spider

insert_requests(spider, requests)

@spec insert_requests(spider(), requests()) :: true | nil

insert multiple requests to spider

list_spiders()

@spec list_spiders() :: [spider()]

list the spiders that have already been started

retry_failed(spider, max_retries \\ 3, timeout \\ :infinity)

@spec retry_failed(spider(), max_retries :: integer(), timeout()) ::
  {:ok, count :: integer()}

retry failed events for a spider

run_until(spider, settings \\ [], fun)

@spec run_until(spider(), settings(), fun()) :: millisecond :: integer()

run_until_zero(spider, settings \\ [], check_interval \\ 1500)

@spec run_until_zero(spider(), settings(), check_interval :: integer()) ::
  millisecond :: integer()

start(spider, settings \\ [])

@spec start(spider(), settings()) :: Supervisor.on_start_child()

start a spider

stats(spider)

@spec stats(spider()) :: [
  status: status(),
  common_pipeline_tid: ets_stats(),
  downloader_tid: ets_stats(),
  failed_tid: ets_stats(),
  spider_tid: ets_stats(),
  item_processor_tid: ets_stats(),
  throughputs: map()
]

fetch spider's statistics

stats(spider, component)

@spec stats(spider(), component()) :: ets_stats()

fetch component's statistics

status(spider)

@spec status(spider()) :: status()

fetch spider's status

stop(spider)

@spec stop(spider()) :: :ok | {:error, error}
when error: :not_found | :running | :restarting

stop a spider

suspend(spider, timeout \\ :infinity)

@spec suspend(spider(), timeout()) :: :ok

suspend a spider

throughput(spider)

@spec throughput(spider()) :: [component_throughput_info :: map()]