Nasty.Language.English (Nasty v0.3.0)

English language implementation.

Provides a full NLP pipeline for English text:

Tokenization (NimbleParsec-based)
POS tagging (rule-based with Universal Dependencies tags)
Morphological analysis (lemmatization + features)
Parsing (placeholder - returns tokens as document)

Summary

Functions

answer_question(document, question_text, opts \\ [])

Answers a question based on a document.

classify(document, model, opts \\ [])

Classifies a document using a trained model.

explain_code(code, opts \\ [])

Explains Elixir code in natural language.

explain_code_to_document(ast, opts \\ [])

Explains Elixir code and returns a natural language AST Document.

extract_events(document, opts \\ [])

Extracts events from a document.

extract_features(document, opts \\ [])

Extracts classification features from a document.

extract_relations(document, opts \\ [])

Extracts semantic relations between entities in a document.

extract_templates(document, templates, opts \\ [])

Extracts information using templates.

label_semantic_roles(document)

Performs semantic role labeling on a document.

recognize_intent(text, opts \\ [])

Recognizes intent from natural language text.

resolve_coreference(document)

Performs coreference resolution on a document.

summarize(document, opts \\ [])

Summarizes a document by extracting important sentences.

to_code(text, opts \\ [])

Converts natural language to Elixir code.

to_code_ast(text, opts \\ [])

Converts natural language to Elixir AST.

train_classifier(training_data, opts \\ [])

Trains a text classifier on labeled documents.

Functions

answer_question(document, question_text, opts \\ [])

@spec answer_question(Nasty.AST.Document.t(), String.t(), keyword()) ::
  {:ok, [Nasty.AST.Answer.t()]} | {:error, term()}

Answers a question based on a document.

Takes a question as text, analyzes it to determine the question type and the expected answer type, then searches the document for relevant passages and extracts answer spans.

Options

:max_answers - Maximum number of answers to return (default: 3)
:min_confidence - Minimum confidence threshold (default: 0.3)
:max_answer_length - Maximum answer length in tokens (default: 20)

Examples

iex> {:ok, document} = English.parse(tagged_tokens)
iex> {:ok, answers} = English.answer_question(document, "Who founded Google?")
iex> is_list(answers)
true

iex> {:ok, answers} = English.answer_question(document, "When was the company founded?", max_answers: 1)
iex> hd(answers).answer_type
:time

classify(document, model, opts \\ [])

@spec classify(Nasty.AST.Document.t(), Nasty.AST.ClassificationModel.t(), keyword()) ::
  {:ok, [Nasty.AST.Classification.t()]} | {:error, term()}

Classifies a document using a trained model.

Returns classifications sorted by confidence.

Examples

iex> {:ok, document} = English.parse(tokens)
iex> {:ok, classifications} = English.classify(document, model)
iex> [top | _rest] = classifications
iex> top.class
:spam

explain_code(code, opts \\ [])

@spec explain_code(
  String.t() | Macro.t(),
  keyword()
) :: {:ok, String.t()} | {:error, term()}

Explains Elixir code in natural language.

Takes Elixir code (string or AST) and generates a natural language explanation.

Examples

iex> {:ok, explanation} = English.explain_code("Enum.sort(numbers)")
iex> explanation
"sort numbers"

iex> {:ok, explanation} = English.explain_code("x = 5")
iex> explanation
"X is 5"

explain_code_to_document(ast, opts \\ [])

@spec explain_code_to_document(
  Macro.t(),
  keyword()
) :: {:ok, Nasty.AST.Document.t()} | {:error, term()}

Explains Elixir code and returns a natural language AST Document.

Examples

iex> ast = quote do: Enum.sort(list)
iex> {:ok, document} = English.explain_code_to_document(ast)
iex> document.language
:en

extract_events(document, opts \\ [])

@spec extract_events(
  Nasty.AST.Document.t(),
  keyword()
) :: {:ok, [Nasty.AST.Event.t()]}

Extracts events from a document.

Options

:min_confidence - Minimum confidence threshold (default: 0.5)
:max_events - Maximum events to return (default: unlimited)
:event_types - List of event types to extract (default: all)

Examples

iex> {:ok, document} = English.parse(tokens)
iex> {:ok, events} = English.extract_events(document)
iex> hd(events).type
:business_acquisition

extract_features(document, opts \\ [])

@spec extract_features(
  Nasty.AST.Document.t(),
  keyword()
) :: map()

Extracts classification features from a document.

Options

:features - Feature types (default: [:bow, :ngrams])
:ngram_size - N-gram size (default: 2)
:min_frequency - Minimum frequency (default: 1)

Examples

iex> features = English.extract_features(document)
iex> is_map(features)
true

extract_relations(document, opts \\ [])

@spec extract_relations(
  Nasty.AST.Document.t(),
  keyword()
) :: {:ok, [Nasty.AST.Relation.t()]}

Extracts semantic relations between entities in a document.

Options

:min_confidence - Minimum confidence threshold (default: 0.5)
:max_relations - Maximum relations to return (default: unlimited)
:relation_types - List of relation types to extract (default: all)

Examples

iex> {:ok, document} = English.parse(tokens)
iex> {:ok, relations} = English.extract_relations(document)
iex> hd(relations).type
:works_at

extract_templates(document, templates, opts \\ [])

@spec extract_templates(
  Nasty.AST.Document.t(),
  [Nasty.Language.English.TemplateExtractor.template()],
  keyword()
) :: {:ok, [Nasty.Language.English.TemplateExtractor.extraction_result()]}

Extracts information using templates.

Arguments

document - Document to extract from
templates - List of template definitions
opts - Options

Options

:min_confidence - Minimum confidence threshold (default: 0.5)
:max_results - Maximum results to return (default: unlimited)

Examples

iex> templates = [TemplateExtractor.employment_template()]
iex> {:ok, results} = English.extract_templates(document, templates)
iex> hd(results).template
"employment"

label_semantic_roles(document)

@spec label_semantic_roles(Nasty.AST.Document.t()) ::
  {:ok, [Nasty.AST.Semantic.Frame.t()]} | {:error, term()}

Performs semantic role labeling on a document.

Extracts predicate-argument structure for all sentences.

Examples

iex> {:ok, frames} = English.label_semantic_roles(document)
iex> is_list(frames)
true

recognize_intent(text, opts \\ [])

@spec recognize_intent(
  String.t(),
  keyword()
) :: {:ok, Nasty.AST.Intent.t()} | {:error, term()}

Recognizes intent from natural language text.

This is a lower-level function that extracts the semantic intent without generating code. Useful for understanding what action the user wants to perform.

Examples

iex> {:ok, intent} = English.recognize_intent("Sort the numbers")
iex> intent.type
:action
iex> intent.action
"sort"

resolve_coreference(document)

@spec resolve_coreference(Nasty.AST.Document.t()) ::
  {:ok, [Nasty.AST.Semantic.CorefChain.t()]} | {:error, term()}

Performs coreference resolution on a document.

Links mentions (pronouns, proper names, definite noun phrases) into coreference chains.

Examples

iex> {:ok, chains} = English.resolve_coreference(document)
iex> is_list(chains)
true

summarize(document, opts \\ [])

@spec summarize(
  Nasty.AST.Document.t(),
  keyword()
) :: [Nasty.AST.Sentence.t()]

Summarizes a document by extracting important sentences.

Options

:ratio - Compression ratio, 0.0 to 1.0 (default: 0.3)
:max_sentences - Maximum number of sentences in summary
:min_sentence_length - Minimum sentence length (in tokens)
:method - Selection method: :greedy or :mmr (Maximal Marginal Relevance) (default: :greedy)
:mmr_lambda - MMR diversity parameter, 0-1 (default: 0.5)

Examples

iex> {:ok, document} = English.parse(tokens)
iex> summary_sentences = English.summarize(document, max_sentences: 3)
iex> is_list(summary_sentences)
true

# With MMR to reduce redundancy
iex> summary = English.summarize(document, max_sentences: 5, method: :mmr)
iex> length(summary) <= 5
true

to_code(text, opts \\ [])

@spec to_code(
  String.t(),
  keyword()
) :: {:ok, String.t()} | {:error, term()}

Converts natural language to Elixir code.

Takes a natural language command and generates executable Elixir code.

Options

:enhance_with_ragex - Use Ragex for context-aware suggestions (default: false)

Examples

iex> {:ok, code} = English.to_code("Sort the numbers")
iex> code
"Enum.sort(numbers)"

iex> {:ok, code} = English.to_code("Filter users where age is greater than 18")
iex> code
"Enum.filter(users, fn item -> item > 18 end)"

to_code_ast(text, opts \\ [])

@spec to_code_ast(
  String.t(),
  keyword()
) :: {:ok, Macro.t()} | {:error, term()}

Converts natural language to Elixir AST.

Similar to to_code/2 but returns the Elixir AST instead of a string.

Examples

iex> {:ok, ast} = English.to_code_ast("Sort the list")
iex> Macro.to_string(ast)
"Enum.sort(list)"

train_classifier(training_data, opts \\ [])

@spec train_classifier(
  [{Nasty.AST.Document.t(), atom()}],
  keyword()
) :: Nasty.AST.ClassificationModel.t()

Trains a text classifier on labeled documents.

Arguments

training_data - List of {document, class} tuples
opts - Training options

Options

:features - Feature types to extract (default: [:bow])
:smoothing - Smoothing parameter (default: 1.0)
:min_frequency - Minimum feature frequency (default: 2)

Examples

iex> training_data = [
...>   {spam_doc, :spam},
...>   {ham_doc, :ham}
...> ]
iex> model = English.train_classifier(training_data)
iex> model.algorithm
:naive_bayes