'''
Defines classes and functions for parsing bodies of text to find words and 
prepare them for output to the user.

The top-level functions in this module are optimized to build L{Word}s from
bodies of text containing more than a single L{Word}. A chunking scheme based on
the average length of words in the English language reduces the number of calls
to L{Word.append} and generally outperforms single character at a time
processing (at least for English text).

@todo: PP: can we optimize this with a classifier dictionary for the current
  state that grows over time and across words?

@var VOWELS: Vowels in the English language used to determine if a word can be
  spoken
@type VOWELS: string

@author: Peter Parente
@author: Larry Weiss
@organization: IBM Corporation
@copyright: Copyright (c) 2005 IBM Corporation
@license: Common Public License 1.0

All rights reserved. This program and the accompanying materials are made
available under the terms of the Common Public License v1.0 which accompanies
this distribution, and is available at
U{http://www.opensource.org/licenses/cpl1.0.php}
'''
import unicodedata as ud
import LSRConstants as lc
from POR import POR
from i18n import _

# define constants
# Upper and lower case Latin vowels (Y is counted as a vowel). The literal is
# passed through the i18n "_" function -- presumably so a translation can
# supply a locale-specific set of vowels (TODO confirm against the i18n module).
VOWELS = _('AEIOUYaeiouy')

def getContextFromString(string, state, por):
  '''  
  Gets the previous, current, and next L{Word}s relative to the given L{POR}. If
  any word is missing, a None value is returned in its place. The string is 
  considered to be at the zero offset of the Item indicated by the L{POR}.
  
  The current word is the last word whose starting character offset is not
  greater than the character offset of the L{POR}. A L{POR} whose character
  offset lies before the start of the first word is treated as if it pointed 
  at the first word.
  
  @param string: Text to parse for words
  @type string: string
  @param state: System settings used to define a word
  @type state: L{LSRSettings}  
  @param por: Point of regard indicating the source accessible and Item for the
    string
  @type por: L{POR}
  @return: Previous, current, and next words surrounding the L{POR}
  @rtype: 3-tuple of L{Word}
  '''
  words = buildWordsFromString(string, state, por)
  c_off = por.char_offset
  # handle degenerate cases of no words or one word
  if not words:
    return None, None, None
  if len(words) == 1:
    return None, words[0], None
  # find the first word that starts after the POR; the POR then lies in the 
  # word just before it
  for i, w in enumerate(words):
    if w.getPOR().char_offset > c_off:
      if i <= 1:
        # the por is in the first word; i == 0 only happens when the offset 
        # precedes the first word, in which case negative indexing would wrap
        # around to the end of the list, so clamp to the first word instead
        return None, words[0], words[1]
      return words[i-2], words[i-1], words[i]
  # no word starts after the POR, so the por is in the last word
  return words[-2], words[-1], None

def buildWordsFromString(string, state, por=None):
  '''
  Splits the given string into a list of L{Word}s as defined by the given 
  state. Every L{Word} is stamped with a L{POR} that shares the accessible and
  item offset of the given L{POR}, or of a dummy L{POR} when none is given, and
  whose character offset is the number of source characters consumed by the
  words before it. The character offset of the given L{POR} is ignored: the 
  string is always treated as starting at the zero offset of the Item.
  
  The list always ends with the word that was being built when the text ran
  out, so an empty string yields a list holding one empty L{Word}.
  
  @param string: Text to parse for words
  @type string: string
  @param state: System settings used to define a word
  @type state: L{LSRSettings}  
  @param por: Point of regard indicating the source accessible and Item for the
    string
  @type por: L{POR}
  @return: L{Word}s parsed from the string
  @rtype: list of L{Word}
  '''
  # size of each slice handed to Word.append; the average word length in the
  # English language
  width = 6
  if por is not None:
    # reuse only the accessible and item info from the given POR
    base = POR(por.accessible, por.item_offset, 0)
  else:
    # no POR provided so fall back on a default one
    base = POR(None, None, 0)
  finished = []
  # number of source characters consumed by the finished words
  consumed = 0
  # start with an empty word at offset zero
  current = Word(state, base)
  for start in xrange(0, len(string), width):
    # offer the next slice of text to the word under construction
    leftover = current.append(string[start:start+width])
    while leftover is not None:
      # the word refused part of the slice, so it is complete: store it and
      # hand the rest to a fresh word positioned after everything parsed so far
      finished.append(current)
      consumed += current.getSourceLength()
      current = Word(state, POR(base.accessible, base.item_offset, consumed))
      leftover = current.append(leftover)
  # the word still under construction is the last one in the text
  finished.append(current)
  return finished

class Word(object):
  '''
  Represents a word in a body of text. Each L{Word} has a main and a trailing
  part where the main part is processed according to other flags in the
  current L{LSRSettings} to improve its presentation to the user via a speech or
  other output device while the trailing part remains unprocessed. The value
  of L{LSRSettings}.WordDef determines what characters lie in the main and 
  trailing parts of each word. The following constants are available in 
  L{LSRConstants}.
  
    - WORD_NON_BLANK: All non-blank characters are added to the main part
    - WORD_ALPHABETIC: All characters considered letters in the current locale
      are added to the main part  
    - WORD_ALPHA_NUMERIC: All characters considered letters and digits in the
      current locale are added to the main part
    - WORD_ALPHA_PUNCT: All characters considered letters and punctuation in the
      current locale are added to the main part
    - WORD_ALPHA_NUMERIC_PUNCT: All characters considered letters, digits, and
      punctuation in the current locale are added to the main part
  
  Characters in the L{LSRSettings}.Ignore list are considered blank. A L{POR} 
  can be associated with a L{Word} to indicate its context in a larger body of 
  text.

  @ivar state: LSR settings that determine the definition of a L{Word} and how
    it is prepared for output
  @type state: L{LSRSettings}
  @ivar por: Point of regard indicating where this L{Word} originated
  @type por: L{POR}
  @ivar source_word: Original characters of this L{Word}, main and trailing,
    without any preparation for output applied
  @type source_word: list
  @ivar main_part: Part of this L{Word} that will receive extra preparation for
    output
  @type main_part: list
  @ivar trail_part: Part of the word that will receive little preparation for 
    output
  @type trail_part: list
  @ivar main_done: Is the L{main_part} complete?
  @type main_done: boolean
  @ivar trail_done: Is the L{trail_part} complete?
  @type trail_done: boolean
  @ivar more: Are there likely more L{Word}s after this one in the text source
    where this L{Word} originated?
  @type more: boolean
  @ivar curr_repeat: Indicates a character should be considered a repeat iff
    this value > L{LSRSettings}.MaxRepeat. It is not the exact number of 
    repetitions of a character as it is optimized for speed, not accuracy    
  @type curr_repeat: integer
  @ivar last_char: Last character appended to this L{Word}
  @type last_char: string
  '''
  def __init__(self, state, por):
    '''
    Stores the L{LSRSettings} of LSR and initializes all instance variables.
    
    @param state: State of LSR that defines L{Word}s and how they are processed
    @type state: L{LSRSettings}
    @param por: Point of regard indicating where this L{Word} originated
    @type por: L{POR}
    '''
    self.state = state
    self.por = por
    self.source_word = []
    self.main_part = []
    self.trail_part = []
    self.main_done = False
    self.trail_done = False
    self.more = False
    self.curr_repeat = 0
    self.last_char = None

  def __eq__(self, other):
    '''
    Compares this L{Word} to the one provided based on their L{POR}s and 
    content. If their L{source_word}s and L{POR}s are the same, they are 
    considered equal. Objects that are not L{Word}s (e.g. None) are never
    equal to a L{Word}.
    
    @param other: Word to compare
    @type other: L{Word}
    @return: Are the two L{Word}s equal? NotImplemented when other is not a
      L{Word} so Python can fall back on its default comparison
    @rtype: boolean
    '''
    if not isinstance(other, Word):
      # comparing against None or a foreign object must not raise
      return NotImplemented
    return (self.por == other.por) and (self.source_word == other.source_word)
  
  def __ne__(self, other):
    '''
    Inverse of L{__eq__}. Defined explicitly because Python 2 does not derive
    the != operator from the == operator.
    
    @param other: Word to compare
    @type other: L{Word}
    @return: Are the two L{Word}s different? NotImplemented when other is not
      a L{Word}
    @rtype: boolean
    '''
    result = self.__eq__(other)
    if result is NotImplemented:
      return result
    return not result
  
  def __unicode__(self):
    '''
    Gets this L{Word} as a string.
    
    @return: Main part of the string joined with the trail
    @rtype: string
    '''
    return u''.join(self.main_part+self.trail_part)
  
  def _isMainChar(self, ch):
    '''
    Determines if the given character should be considered a part of the main
    part of this word or not based on the definition of the word given by
    L{LSRSettings}. Output markers (see L{isMarker}) are always main 
    characters regardless of the word definition.
    
    @param ch: Character to test
    @type ch: string
    @return: Does the character belong in the main part? Always False when 
      L{LSRSettings}.WordDef is not one of the known word definitions
    @rtype: boolean
    '''
    if self.isMarker(ch):
      return True
    if self.state.WordDef == lc.WORD_NON_BLANK:
      return not self.isBlank(ch)
    elif self.state.WordDef == lc.WORD_ALPHABETIC:
      return self.isAlpha(ch)
    elif self.state.WordDef == lc.WORD_ALPHA_NUMERIC:
      return self.isAlpha(ch) or self.isNumeric(ch)
    elif self.state.WordDef == lc.WORD_ALPHA_PUNCT:
      return self.isAlpha(ch) or self.isPunctuation(ch)
    elif self.state.WordDef == lc.WORD_ALPHA_NUMERIC_PUNCT:
      return self.isAlpha(ch) or self.isNumeric(ch) or self.isPunctuation(ch)
    else:
      return False
  
  def setPOR(self, por):
    '''
    Associates the start of this L{Word} with the given L{POR}.
    
    @param por: Point of regard pointing to the start of this word
    @type por: L{POR}
    '''
    self.por = por
    
  def getPOR(self):
    '''
    Gets the L{POR} associated with the start of this L{Word}.
    
    @return: Point of regard pointing to the start of this word
    @rtype: L{POR}
    '''
    return self.por
    
  def isBlank(self, ch):
    '''
    Determines if the given character is blank or ignored.
    
    @param ch: Character to test
    @type ch: string
    @return: Is the character whitespace or in L{LSRSettings}.Ignore?
    @rtype: boolean
    '''
    return ch.isspace() or ch in self.state.Ignore
  
  def isAlpha(self, ch):
    '''
    Determines if the given character is a letter in the current locale.
    
    @param ch: Character to test
    @type ch: string
    @return: Is the character a letter?
    @rtype: boolean
    '''
    return ch.isalpha()
    
  def isNumeric(self, ch):
    '''
    Determines if the given character is a number in the current locale.
    
    @param ch: Character to test
    @type ch: string
    @return: Is the character a number?
    @rtype: boolean
    '''
    return ch.isdigit()
  
  def isPunctuation(self, ch):
    '''
    Determines if the given character is a punctuation mark. A character is
    treated as punctuation when its Unicode general category is modifier 
    letter (Lm) or any of the mark (M*), punctuation (P*), or symbol (S*) 
    categories.
    
    @param ch: Character to test
    @type ch: string
    @return: Is the character a punctuation mark?
    @rtype: boolean
    '''
    cat = ud.category(unicode(ch))
    return (cat == 'Lm' or cat[0] in ['M', 'P', 'S'])
  
  def isSymbol(self, ch):
    '''
    Determines if the given character is a symbol. The test performed is 
    whether the Unicode general category of the character starts with 'C' 
    (the "other" categories such as control and format characters). 
    Characters in the Unicode symbol categories (S*) are reported by 
    L{isPunctuation} instead.
    
    @param ch: Character to test
    @type ch: string
    @return: Is the character a symbol?
    @rtype: boolean
    '''
    return ud.category(unicode(ch)).startswith('C')
    
  def isIcon(self, ch):
    '''
    Determines if the given character is an L{AccessEngine} icon.

    @param ch: Character to test
    @type ch: string
    @return: Is the character an icon?
    @rtype: boolean
    '''
    return ch == lc.CHAR_ICON
  
  def isMarker(self, ch):
    '''
    Determines if the given character is an L{AccessEngine} output marker: one
    of the icon, null, proxy, table cell, or value changer characters defined 
    in L{LSRConstants}.

    @param ch: Character to test
    @type ch: string
    @return: Is the character an output marker?
    @rtype: boolean
    '''
    return ch in [lc.CHAR_ICON, lc.CHAR_NULL, lc.CHAR_PROXY, lc.CHAR_TABLE_CELL,
                  lc.CHAR_VALUE_CHANGER]
  
  def isNull(self, ch):
    '''
    Determines if the given character is the L{AccessEngine} null marker
    character.

    @param ch: Character to test
    @type ch: string
    @return: Is the character the null marker?
    @rtype: boolean
    '''    
    return ch == lc.CHAR_NULL
  
  def isVowel(self, ch):
    '''
    Determines if the given character is a vowel. Currently only checks for
    vowels in the Latin alphabet.

    @param ch: Character to test
    @type ch: string
    @return: Is the character a Latin vowel?
    @rtype: boolean
    '''  
    return ch in VOWELS
  
  def isCap(self, ch):
    '''
    Determines if the given character is an upper case letter.

    @param ch: Character to test
    @type ch: string
    @return: Is the character capitalized?
    @rtype: boolean
    '''      
    return ch.isupper()
  
  def getCharDescription(self, ch):
    '''
    Gets a localized description of the given character. The most detailed
    description for a character is returned so that, for instance, 'e' is
    described as a vowel and not just a letter. The tests are made in a fixed
    order and the first one that matches wins: spelled, ignored, blank, 
    letter (capital before vowel before plain letter), number, punctuation, 
    marker, symbol.

    @param ch: Character to test
    @type ch: string
    @return: Localized description of the character according to the processing
      done by this L{Word} class and based on the current state, or None if
      the character fits none of the known classes
    @rtype: string or None
    '''  
    if ch in self.state.SpelledChars:
      return _('Spelled')
    elif ch in self.state.Ignore:
      return _('Ignored')
    elif self.isBlank(ch):
      return _('Blank')
    elif self.isAlpha(ch):
      if self.isCap(ch):
        return _('Capital')
      elif self.isVowel(ch):
        return _('Vowel')
      else:
        return _('Letter')
    elif self.isNumeric(ch):
      return _('Number')
    elif self.isPunctuation(ch):
      return _('Punctuation')
    elif self.isMarker(ch):
      return _('Marker')
    elif self.isSymbol(ch):
      return _('Symbol')
    return None
  
  def getSource(self):
    '''
    Gets the unprocessed text of this word as it was seen in the original text
    source. The internal list is returned, not a copy, so callers should not
    modify it.
    
    @return: Characters of the parsed word without any processing applied
    @rtype: list of string
    '''
    return self.source_word
  
  def getSourceLength(self):
    '''
    Gets the length of the unprocessed source text of this L{Word}.
    
    @return: Length of the L{source_word}
    @rtype: integer
    '''
    return len(self.source_word)
  
  def getMainLength(self):
    '''
    Gets the length of the processed main part of this L{Word}.
    
    @return: Length of the L{main_part}
    @rtype: integer
    '''
    return len(self.main_part)
  
  def moreAvailable(self):
    '''
    Makes a guess as to whether or not there are more L{Word}s in the body of
    text from which this word originated. This guess is based on whether or not
    the last chunk passed to L{append} was processed in full.
    
    @return: Are there likely more L{Word}s in the original body of text
    @rtype: boolean
    '''
    return self.more
  
  def hasCap(self):
    '''
    Gets if this L{Word} contains an uppercase letter or not.
    
    @return: Does this L{Word} contain a capital letter?
    @rtype: boolean
    '''
    for ch in self.source_word:
      if self.isCap(ch):
        return True
    return False
  
  def hasVowel(self):
    '''
    Gets if this L{Word} contains a vowel or not.
    
    @return: Does this L{Word} contain a vowel?
    @rtype: boolean
    '''
    for ch in self.source_word:
      if self.isVowel(ch):
        return True
    return False
  
  def isAllCaps(self):
    '''
    Gets if this L{Word} is all capitals or not. The test is applied to the 
    whole source text of the word, trailing characters included: the word 
    must contain at least one cased character and no lowercase characters.
    
    @return: Is this L{Word} all capital letters?
    @rtype: boolean
    '''
    # source_word is a list of characters, which has no string methods, so 
    # join it back into text before testing
    return self.isCap(''.join(self.source_word))
  
  def isAllNumeric(self):
    '''
    Gets if this L{Word} is all numbers or not. The test is applied to the 
    whole source text of the word, trailing characters included: the word 
    must be non-empty and consist of digits only.
    
    @return: Is this L{Word} all numbers?
    @rtype: boolean
    '''
    # source_word is a list of characters, which has no string methods, so 
    # join it back into text before testing
    return self.isNumeric(''.join(self.source_word))
  
  def isAllBlank(self):
    '''
    Gets if this L{Word} is all blanks or not. An empty L{Word} is considered
    all blank.
    
    @return: Is this L{Word} all blanks?
    @rtype: boolean
    '''
    for ch in self.source_word:
      if not self.isBlank(ch):
        return False
    return True
  
  def shouldBeSpelled(self):
    '''
    Gets if this L{Word} should be spelled. A word is always spelled when 
    L{LSRSettings}.Format is FORMAT_SPELL or FORMAT_PHONETIC. Otherwise, a word
    that is not all blank is spelled when any of the following holds:
    
      - its main part is a single character, L{LSRSettings}.ForceSpell is on,
        and the format is not FORMAT_TEXT
      - L{LSRSettings}.ForceSpell is on and the word is not all numbers and 
        does not have a vowel
      - L{LSRSettings}.SpellCaps is on and the word is all caps
      - the word has a character repeated more than L{LSRSettings}.MaxRepeat 
        times and is not all numbers
    
    @return: Should this L{Word} be spelled rather than sounded out as written?
    @rtype: boolean
    '''
    if (self.state.Format == lc.FORMAT_SPELL or 
        self.state.Format == lc.FORMAT_PHONETIC):
      return True
    elif not self.isAllBlank():
      if (self.getMainLength() == 1 and 
          (self.state.ForceSpell and not self.state.Format == lc.FORMAT_TEXT)):
        # single character
        return True
      elif (self.state.ForceSpell and 
            (not self.isAllNumeric() and not self.hasVowel())):
        # force spelling of words without vowels
        return True
      elif self.state.SpellCaps and self.isAllCaps():
        # force spelling of acronyms
        return True
      elif self.curr_repeat > self.state.MaxRepeat and not self.isAllNumeric():
        # force spelling of words with repeats in them
        return True
    return False
  
  def append(self, chunk):
    '''
    Parses the given chunk of text for characters that should be added to the
    L{main_part} or L{trail_part} of this L{Word}. If this word has neither
    L{main_done} or L{trail_done} set, then all main characters determined by
    L{_isMainChar} up to the first non-main character are added to the main part
    of this word. When the first non-main character is encountered, 
    L{main_done} is set. If this word has L{main_done} set and L{trail_done} 
    unset, all non-main characters are added to the trail part of this word. 
    When another main character is encountered after L{main_done} is set, 
    L{trail_done} is set and the remainder of the given chunk is returned 
    unprocessed to be added to another L{Word}. Once L{trail_done} is set, no 
    further text can be appended to this L{Word} and any chunk given is 
    returned whole.
    
    @param chunk: Chunk of text to parse for words
    @type chunk: string
    @return: Unprocessed portion of the chunk or None if fully processed
    @rtype: string or None
    @see: L{_processMain}
    @see: L{_processTrail}
    '''
    if self.trail_done:
      # don't add anything new after the trail is complete
      return chunk
    for i, ch in enumerate(chunk):
      mc = self._isMainChar(ch)
      if mc:
        if self.main_done:
          # not accepting more main characters; this character starts the
          # next word so hand it and everything after it back to the caller
          self.trail_done = True
          self.more = True
          return chunk[i:]
        else:
          # process a new main character
          ch = self._processMain(ch)
      else:
        # process a new trail character
        ch = self._processTrail(ch)
        self.main_done = True
      # detect character repetitions for everything except the ellipsis
      # this method does not keep accurate track of the number of repetitions,
      # only that some character was repeated more than MaxRepeat number of 
      # times in this word; once the count reaches MaxRepeat it is no longer 
      # reset by a different character
      if ch == self.last_char and ch != '.':
        self.curr_repeat += 1
      elif self.curr_repeat < self.state.MaxRepeat:
        self.last_char = ch
        self.curr_repeat = 1
    return None
  
  def _processMain(self, ch):
    '''
    Adds the given character to the L{source_word}. If L{LSRSettings}.Caps is 
    unset, makes the character lowercase. If {LSRSettings.CapExpand} and the 
    character is a capital letter or {LSRSettings.NumExpand} and the character 
    is a number, inserts a space in L{main_part} unless the main part is still
    empty. Finally inserts the possibly lowercased character in L{main_part}.
    
    @param ch: Character to process
    @type ch: string
    @return: Character inserted in L{main_part}
    @rtype: string
    '''
    # the source always keeps the character exactly as it was seen
    self.source_word.append(ch)
    if not self.state.Caps:
      ch = ch.lower()
    # the capital test runs on the possibly lowercased character
    if self.state.CapExpand and self.isCap(ch) and self.getMainLength():
      self.main_part.append(' ')
    elif self.state.NumExpand and self.isNumeric(ch) and self.getMainLength():
      self.main_part.append(' ')
    self.main_part.append(ch)
    return ch
            
  def _processTrail(self, ch):
    '''
    Adds the given character to the L{source_word}. If the character is a blank,
    inserts a space in L{trail_part}, else inserts the character.
    
    @param ch: Character to process
    @type ch: string
    @return: Character inserted in L{trail_part}
    @rtype: string
    '''
    # the source always keeps the character exactly as it was seen
    self.source_word.append(ch)
    if self.isBlank(ch):
      ch = ' '
    self.trail_part.append(ch)
    return ch
  
#if __name__ == '__main__':
  #import sys
  #def printWords(words):
    #for w in words:
      #sys.stdout.write(str(w))
      #print
    #for w in words:
      #print w.getPOR()
  
  #class TestState(object):
    #def __init__(self, **kwargs):
      #self.__dict__.update(kwargs)

  #test = "JAWS is a screen reader. I'd like $10.00 for free. Hey--look at that!"
  #state = TestState(WordDef=lc.WORD_ALPHA_NUMERIC, Caps=True, CapExpand=False, 
                    #NumExpand=True, Ignore=[], MaxRepeat=3)
  #printWords(buildWordsFromString(test, state))
  #for i in range(len(test)):
    #p, c, n = getContextFromString(test, state, POR(None, None, i))
    #try: print p.getPOR(), p,
    #except: print 'no prev',
    #try: print c.getPOR(), c,
    #except: print 'no current',
    #try: print n.getPOR(), n,
    #except: print 'no next',
    #print