########################################################################
#
#       License: BSD
#       Created: June 08, 2004
#       Author:  Francesc Altet - faltet@carabos.com
#
#       $Source: /home/ivan/_/programari/pytables/svn/cvs/pytables/pytables/tables/Index.py,v $
#       $Id: Index.py 1015 2005-06-17 17:55:14Z faltet $
#
########################################################################

"""Here is defined the Index class.

See Index class docstring for more info.

Classes:

    Index

Functions:


Misc variables:

    __version__


"""

import warnings
import math
import cPickle

import numarray

import tables.hdf5Extension as hdf5Extension
import tables.utilsExtension as utilsExtension
from tables.AttributeSet import AttributeSet
from tables.Atom import Atom
from tables.Leaf import Filters
from tables.IndexArray import IndexArray
from tables.Group import Group
from tables.utils import joinPath

# Revision keyword string of this module
__version__ = "$Revision: 1015 $"

# Default version for INDEX objects; Index.__init__ stores it as the
# `_v_version` of every Index instance
#obversion = "1.0"    # initial version
obversion = "2.0"    # indexes moved to a hidden directory

# Python implementations of NextAfter and NextAfterF
#
# These implementations exist because the standard function
# nextafterf is not available on Microsoft platforms.
#
# These implementations are based on the IEEE representation of
# floats and doubles.
# Author:  Shack Toms - shack@livedata.com
#
# Thanks to Shack Toms shack@livedata.com for NextAfter and NextAfterF
# implementations in Python. 2004-10-01

# Spacing of the representable values in [0.5, 1), i.e. in the range of the
# mantissas that math.frexp() returns for positive normalized numbers
epsilon  = math.ldexp(1.0, -53) # smallest double such that 0.5+epsilon != 0.5
epsilonF = math.ldexp(1.0, -24) # smallest float such that 0.5+epsilonF != 0.5

# Largest finite double and single precision values
maxFloat = float(2**1024 - 2**971)  # From the IEEE 754 standard
maxFloatF = float(2**128 - 2**104)  # From the IEEE 754 standard

# Smallest positive normalized values; anything closer to zero is
# denormalized (or zero)
minFloat  = math.ldexp(1.0, -1022) # min positive normalized double
minFloatF = math.ldexp(1.0, -126)  # min positive normalized float

# Spacing of the representable values in the denormalized range
smallEpsilon  = math.ldexp(1.0, -1074) # smallest increment for doubles < minFloat
smallEpsilonF = math.ldexp(1.0, -149)  # smallest increment for floats < minFloatF

# Doubling 2**1023 overflows the double range and yields +infinity
infinity = math.ldexp(1.0, 1023) * 2
# 2**128 is a finite double, but it is the first power of two beyond
# maxFloatF; PyNextAfterF uses it as the single precision "infinity" limit
infinityF = math.ldexp(1.0, 128)
#Finf = float("inf")  # Infinite in the IEEE 754 standard (not avail in Win)

# A portable representation of NaN
# if sys.byteorder == "little":
#     testNaN = struct.unpack("d", '\x01\x00\x00\x00\x00\x00\xf0\x7f')[0]
# elif sys.byteorder == "big":
#     testNaN = struct.unpack("d", '\x7f\xf0\x00\x00\x00\x00\x00\x01')[0]
# else:
#     raise ValueError, "Byteorder '%s' not supported!" % sys.byteorder
# This one seems better (infinity minus infinity is NaN in IEEE arithmetic)
testNaN = infinity - infinity

# Utility functions
def infType(type, itemsize, sign=0):
    """Return an extreme value usable as an open range limit for a type.

    Parameters:

    type -- the type of the values; only its string form is inspected,
        and "CharType" selects the string flavor of the limits
    itemsize -- the length of the returned string (CharType only)
    sign -- any true value asks for the lower limit, a false one (the
        default) for the upper limit

    For CharType a string made of itemsize "\\x00" (lower limit) or
    "\\xff" (upper limit) chars is returned.  For any other type the
    result is minus or plus `infinity`.

    """
    isString = (str(type) == "CharType")

    if isString:
        # Strings are bounded by the all-zeros and the all-ones values
        if sign:
            filler = "\x00"
        else:
            filler = "\xff"
        return filler*itemsize

    # Everything else is bounded by the floating point infinities
    if sign:
        return -infinity
    return infinity


# Beware: this check does not work for Python 2.2.x or 2.3.x (!)
def IsNaN(x):
    """Tell whether x is a NaN (x is assumed to be a float).

    The test relies on NaN being the only float value that compares
    unequal to itself.

    """
    differsFromItself = (x != x)
    return differsFromItself

def PyNextAfter(x, y):
    """Return the next double after x in the direction of y.

    Parameters:

    x -- the starting value
    y -- any value telling the direction: the result is greater than x
        when y > x and smaller than x when y < x

    x itself is returned when no step is possible: x or y is NaN, x
    equals y, or x is already an infinity.  Stepping beyond the largest
    finite double returns the corresponding infinity.

    """
    # if x or y is Nan, we don't do much
    if IsNaN(x) or IsNaN(y):
        return x

    # we can't progress if x == y
    if x == y:
        return x

    # similarly if x is infinity
    if x >= infinity or x <= -infinity:
        return x

    goingUp = y > x   # we know x != y

    # Zero and the denormalized numbers are evenly spaced
    if -minFloat < x < minFloat:
        if goingUp:
            return x + smallEpsilon
        else:
            return x - smallEpsilon

    # The neighbour beyond the largest finite double is infinity.  It
    # has to be returned explicitly because the mantissa would reach
    # 1.0 with the maximum exponent and math.ldexp() would overflow.
    if goingUp:
        if x == maxFloat:
            return infinity
    elif x == -maxFloat:
        return -infinity

    # it looks like we have a normalized number
    # break x down into a mantissa and exponent (0.5 <= abs(m) < 1)
    m, e = math.frexp(x)

    if goingUp:
        towardZero = m < 0.0
    else:
        towardZero = m > 0.0

    # The doubles just below a power of two (in magnitude) are spaced
    # half as far apart as those above it, so a full epsilon step
    # towards zero would skip a representable value there.  The
    # exception is +/-minFloat, whose inner neighbours are denormalized
    # and keep the same spacing.
    if (towardZero and (m == 0.5 or m == -0.5)
        and (x > minFloat or x < -minFloat)):
        step = epsilon / 2
    else:
        step = epsilon

    if goingUp:
        m += step
    else:
        m -= step

    return math.ldexp(m, e)

def _neighborOnGrid(v, unit, inwardUnit, goingUp):
    """Return the multiple of unit next to v in the requested direction.

    v is first truncated towards zero to a multiple of unit.  Moving
    away from zero always lands one unit beyond that truncated value.
    Moving towards zero lands on the truncated value itself when v was
    not a multiple of unit, and inwardUnit before it otherwise.

    """
    # since Python uses double internally, we
    # may have some extra precision to toss
    if v > 0.0:
        extra = v % unit
    elif v < 0.0:
        extra = v % -unit
    else:
        extra = 0.0
    truncated = v - extra

    if goingUp:
        outward = v >= 0.0
    else:
        outward = v <= 0.0

    if outward:
        if goingUp:
            return truncated + unit
        return truncated - unit

    if extra != 0.0:
        # The truncated value already lies between v and zero
        return truncated
    if goingUp:
        return truncated + inwardUnit
    return truncated - inwardUnit


def PyNextAfterF(x, y):
    """Return the next IEEE single after x in the direction of y.

    Parameters:

    x -- the starting value; it may carry more precision than a single
    y -- any value telling the direction: the result is greater than x
        when y > x and smaller than x when y < x

    The result is the closest single precision value strictly beyond x
    in the requested direction.  x itself is returned when x or y is
    NaN or when x equals y.  Values at or beyond +/-infinityF are
    clamped to +/-infinityF.

    """

    # if x or y is Nan, we don't do much
    if IsNaN(x) or IsNaN(y):
        return x

    # we can't progress if x == y
    if x == y:
        return x

    # similarly if x is infinity
    if x >= infinityF:
        return infinityF
    elif x <= -infinityF:
        return -infinityF

    goingUp = y > x   # we know x != y

    # Zero and the denormalized singles are evenly spaced
    if -minFloatF < x < minFloatF:
        return _neighborOnGrid(x, smallEpsilonF, smallEpsilonF, goingUp)

    # it looks like we have a normalized number
    # break x down into a mantissa and exponent (0.5 <= abs(m) < 1)
    m, e = math.frexp(x)

    # The singles just below a power of two (in magnitude) are spaced
    # half as far apart as those above it.  The exception is
    # +/-minFloatF, whose inner neighbours are denormalized and keep
    # the same spacing.
    if (m == 0.5 or m == -0.5) and (x > minFloatF or x < -minFloatF):
        inwardStep = epsilonF / 2
    else:
        inwardStep = epsilonF

    return math.ldexp(_neighborOnGrid(m, epsilonF, inwardStep, goingUp), e)


def CharTypeNextAfter(x, direction, itemsize):
    """Return the next representable neighbor of the string x.

    Parameters:

    x -- the starting string; it is right-padded with "\\x00" chars up
        to itemsize before looking for the neighbor
    direction -- a positive value asks for the next greater string, any
        other value for the next smaller one
    itemsize -- the length of the strings being compared

    The padded string is handled as a big-endian number in base 256
    that is incremented or decremented by one.  The greatest string
    (all "\\xff") and the smallest one (all "\\x00") are returned
    unchanged instead of wrapping around.

    """
    # Pad the string with \x00 chars until itemsize completion
    padsize = itemsize - len(x)
    if padsize > 0:
        x += "\x00"*padsize

    if direction > 0:
        exhausted, rollover, delta = 0xff, "\x00", +1
    else:
        exhausted, rollover, delta = 0x00, "\xff", -1

    # An extreme value has no neighbor in this direction: return it
    # as is.  (The string itself must be compared here; comparing a
    # list of chars with a string is never true.)
    if x == chr(exhausted)*len(x):
        return x

    chars = list(x)
    # Walk from the least significant (last) char, propagating the
    # carry/borrow through the chars that cannot move any further
    for i in range(len(chars)-1, -1, -1):
        code = ord(chars[i])
        if code != exhausted:
            chars[i] = chr(code + delta)
            break
        chars[i] = rollover
    return "".join(chars)


def nextafter(x, direction, type, itemsize):
    """Return the next representable neighbor of x for the given type.

    Parameters:

    x -- the starting value
    direction -- negative for the neighbor below x, positive for the
        neighbor above x; with 0, x itself is returned
    type -- the type of the values; "CharType", the integral types
        known to numarray, "Float32" and "Float64" are supported
    itemsize -- the length of the strings (only used for CharType)

    A TypeError is raised for any other type.

    """

    if direction == 0:
        return x

    stype = str(type)
    if stype == "CharType":
        return CharTypeNextAfter(x, direction, itemsize)
    elif isinstance(numarray.typeDict[type], numarray.IntegralType):
        if direction < 0:
            return x-1
        else:
            return x+1
    elif stype == "Float32":
        # The direction is given as an infinity and not as x-1 or x+1:
        # for big enough values x-1 == x, so x would be returned
        # unchanged and a strict limit would become an inclusive one.
        if direction < 0:
            return PyNextAfterF(x, -infinity)
        else:
            return PyNextAfterF(x, infinity)
    elif stype == "Float64":
        # Same remark on the direction as above.  The neighbors beyond
        # the largest finite doubles are the infinities; they are
        # returned from here so that this does not depend on how
        # PyNextAfter copes with the overflow of the mantissa.
        if direction < 0:
            if x == -maxFloat:
                return -infinity
            return PyNextAfter(x, -infinity)
        else:
            if x == maxFloat:
                return infinity
            return PyNextAfter(x, infinity)
    else:
        raise TypeError("Type %s is not supported" % type)


class IndexProps(object):
    """Container for the properties of the indexes of a table.

    Instance variables:

        auto -- whether an existing index should be updated or not after
            a Table append operation
        reindex -- whether the table fields are to be re-indexed after an
            invalidating index operation (like Table.removeRows)
        filters -- the filter properties for the Table indexes

    """

    def __init__(self, auto=1, reindex=1, filters=None):
        """Create a new IndexProps instance.

        Parameters:

        auto -- 1 if an existing index has to be updated after a Table
            append operation, 0 if not.  None means the default (1).
        reindex -- 1 if the table fields have to be re-indexed after an
            invalidating index operation (like Table.removeRows), 0 if
            not.  None means the default (1).
        filters -- a Filters instance.  When it is not given, ZLIB
            (level 1) with shuffle is used.

        A TypeError is raised when filters is neither None nor a
        Filters instance.

        """
        # None stands for "use the default" in both flags
        if auto is None:
            auto = 1
        if reindex is None:
            reindex = 1
        assert auto in [0, 1], "'auto' can only take values 0 or 1"
        assert reindex in [0, 1], "'reindex' can only take values 0 or 1"
        self.auto = auto
        self.reindex = reindex

        if filters is not None and not isinstance(filters, Filters):
            raise TypeError(
"If you pass a filters parameter, it should be a Filters instance.")
        if filters is None:
            filters = Filters(complevel=1, complib="zlib",
                              shuffle=1, fletcher32=0)
        self.filters = filters

    def __repr__(self):
        """Return a constructor-like representation of the properties."""
        return "%s(auto=%s, reindex=%s, filters=%s)" % (
            self.__class__.__name__, self.auto, self.reindex, self.filters)

    def __str__(self):
        """Return the same representation as __repr__."""
        return repr(self)

class Index(hdf5Extension.Index, Group):
    """Represent the index (sorted and reverse index) dataset in HDF5 file.

    It enables to create indexes of Columns of Table objects.

    All Numeric and numarray typecodes are supported except for complex
    datatypes.

    Methods:

        append(arr)
        search(item)
        getCoords(startCoords, maxCoords)
        getLookupRange(column)

    Instance variables:

        column -- The column object this index belongs to
        type -- The type class for the index.
        itemsize -- The size of the atomic items. Specially useful for
            CharArrays.
        nrows -- The number of slices in index.
        nelements -- The total number of elements in the index.
        nelemslice -- The number of elements per slice.
        chunksize -- The HDF5 chunksize for each slice.
        filters -- The Filters instance for this object.
        dirty -- Whether the index is dirty or not.
        sorted -- The IndexArray object with the sorted values information.
        indices -- The IndexArray object with the sorted indices information.

    """

    _c_classId = 'CINDEX'

    def __init__(self, atom = None, column = None, name = None,
                 title = "", filters = None, expectedrows = 1000,
                 testmode = 0, new = True):
        """Create an Index instance.

        Keyword arguments:

        atom -- An Atom object representing the shape, type and flavor
            of the atomic objects to be saved. Only scalar atoms are
            supported.

        column -- The column object to be indexed

        name -- The name for this Index object.

        title -- Sets a TITLE attribute of the Index entity.

        filters -- An instance of the Filters class that provides
            information about the desired I/O filters to be applied
            during the life of this object. If not specified, the ZLIB
            & shuffle will be activated by default (i.e., they are not
            inherited from the parent, that is, the Table).

        expectedrows -- Represents an user estimate about the number
            of row slices that will be added to the growable dimension
            in the IndexArray object. If not provided, the default
            value is 1000 slices.

        testmode -- A flag that is stored and handed to the IndexArray
            constructors when the index is created.

        new -- Handed to the Group constructor.  The atom and column
            arguments are only kept when it is true.

        """
        super(Index, self).__init__(title, new, filters)

        # Attributes are written straight into the instance dictionary
        # all over this class (presumably to keep clear of the
        # attribute handling of Group -- to be confirmed)
        mydict = self.__dict__

        # The next are needed by the Group protocol
        mydict['name'] = name
        mydict['_v_expectedrows'] = expectedrows
        mydict['_v_version'] = obversion

        # Collect info in case the index is to be created
        if new:
            mydict['atom'] = atom
            mydict['column'] = column

        mydict['testmode'] = testmode

    def _g_create(self):
        """Call the createGroup method in extension to create the
        group on disk. Also set attributes for this group. Finally,
        add the index information as IndexArrays."""

        mydict = self.__dict__
        # Create the Index Group
        mydict['_v_objectID'] =  self._g_createGroup()

        # Set the filters for this object (they are *not* inherited)
        if self._v_new_filters is None:
            # If not filters has been passed in the constructor,
            # set a sensible default, using zlib compression and shuffling
            filters = Filters(complevel = 1, complib = "zlib",
                              shuffle = 1, fletcher32 = 0)
        else:
            filters = self._v_new_filters

        # Create the IndexArray for sorted values
        sortedArray = IndexArray(self, self.atom,
                                 "Sorted Values", filters,
                                 self._v_expectedrows, self.testmode)
        # Register the sorted values array
        setattr(self, "sorted", sortedArray)

        # Create the IndexArray for index values
        indicesArray = IndexArray(self, Atom("Int32", shape=1),
                                  "Reverse Indices", filters,
                                  self._v_expectedrows, self.testmode)
        # Register the indices array
        setattr(self, "indices", indicesArray)

        # get the remaining variables
        self._getCompleteObject()

    def _g_afterOpen(self):
        """Tidy object tree after its creation."""

        # Call the post-open methods in sorted and indices
        # This has to executed *before* _getCompleteObject
        self.sorted._g_afterOpen()
        self.indices._g_afterOpen()

        # get the remaining variables
        self._getCompleteObject()

    def _getCompleteObject(self):
        "Get the remaining variables for a complete Index object"

        # All of them are taken from the IndexArray of sorted values
        mydict = self.__dict__
        mydict['type'] = self.sorted.type
        mydict['itemsize'] = self.sorted.itemsize
        mydict['chunksize'] = self.sorted.chunksize
        mydict['byteorder'] = self.sorted.byteorder
        mydict['nrows'] = self.sorted.nrows
        mydict['nelemslice'] = self.sorted.nelemslice
        mydict['nelements'] = self.nrows * self.nelemslice
        mydict['shape'] = (self.nrows, self.nelemslice)
        mydict['filters'] = self.sorted.filters

    def append(self, arr):
        """Append the object to this (enlargeable) object

        The values in arr are sorted; the sorting order is appended to
        the `indices` array and the sorted values to the `sorted` one.
        The nrows, nelements and shape attributes are updated afterwards.

        """

        # Getting the order with argsort and applying it afterwards is
        # about 10% faster than calling both sort and argsort, but the
        # ideal solution would be to find a function in numarray that
        # returns both sorted and argsorted all in one call
        if str(self.sorted.type) == "CharType":
            order = arr.argsort()
        else:
            order = numarray.argsort(arr)
        # Caveat: this conversion is necessary for portability on
        # 64-bit systems because indexes are 64-bit long on these
        # platforms
        self.indices.append(numarray.array(order, type="Int32"))
        # Save the sorted array
        self.sorted.append(arr[order])

        # Update nrows after a successful append
        mydict = self.__dict__
        mydict['nrows'] = self.sorted.nrows
        mydict['nelements'] = self.nrows * self.nelemslice
        mydict['shape'] = (self.nrows, self.nelemslice)

    def search(self, item):
        """Do a binary search in this index for an item

        item is the pair of limits (both included) of the range to be
        looked for.  For every slice of the index, the start and the
        length of the run of matching sorted values are saved in the
        `starts` and `lengths` attributes, where getCoords() finds
        them.  The total number of matching values is returned.

        """
        mydict = self.__dict__
        mydict['starts'] = []; mydict['lengths'] = []
        tlen = 0
        self.sorted._initSortedSlice(self.chunksize)
        try:
            # Do the lookup for values fullfilling the conditions
            for i in xrange(self.sorted.nrows):
                (start, stop, niter) = self.sorted._searchBin(i, item)
                self.starts.append(start)
                self.lengths.append(stop - start)
                tlen += stop - start
        finally:
            # Release what _initSortedSlice prepared, also when a
            # lookup fails
            self.sorted._destroySortedSlice()
        assert tlen >= 0, "Index.search(): Post-condition failed. Please, report this to the authors."
        return tlen

    # This has been ported to Pyrex. However, with pyrex it has the same
    # speed, so, it's better to stay here
    def getCoords(self, startCoords, maxCoords):
        """Get the coordinates of indices satisfiying the cuts.

        You must call the Index.search() method before in order to get
        good sense results.

        startCoords is the position, among all the values found by the
        last search, of the first coordinate to get, and maxCoords is
        the maximum number of coordinates to get.  The coordinates are
        returned sorted.

        """
        len1 = 0; len2 = 0; relCoords = 0
        # Correction against asking too many elements
        nindexedrows = self.nelemslice*self.nrows
        if startCoords + maxCoords > nindexedrows:
            maxCoords = nindexedrows - startCoords
        for irow in xrange(self.sorted.nrows):
            # The values found in this slice take the positions
            # len1 <= position < len2 among all the values found
            leni = self.lengths[irow]; len2 += leni
            if (leni > 0 and len1 <= startCoords < len2):
                startl = self.starts[irow] + (startCoords-len1)
                # Read maxCoords as maximum
                stopl = startl + maxCoords
                # Correction if stopl exceeds the limits
                rowStop = self.starts[irow] + leni
                if stopl > rowStop:
                    stopl = rowStop
                self.indices._g_readIndex(irow, startl, stopl, relCoords)
                incr = stopl - startl
                relCoords += incr; startCoords += incr; maxCoords -= incr
                if maxCoords == 0:
                    break
            len1 += leni

        # I don't know if sorting the coordinates is better or not actually
        # Some careful tests must be carried out in order to do that
        #selections = self.indices.arrAbs[:relCoords]
        selections = numarray.sort(self.indices.arrAbs[:relCoords])
        return selections

    # A version of getCoords that kept track of the visited rows in
    # order to not re-visit them again was tried here (getCoords_notwork,
    # 2004-08-03), but it never got to work well.  The improvement in
    # speed should be not important in the majority of cases.  Beware,
    # the logic behind doing this is not trivial at all.

    def getLookupRange(self, column):
        """Search the index for the selection set up for a column.

        The limits of the selection are read from `table.opsValues` and
        the codes of the operators from `table.ops`, where table is
        `column.table`.  They are turned into a range with both limits
        included, which is handed to search().  The number of values
        found is returned.

        A TypeError is raised if a limit is not compatible with the
        type of the column, a ValueError if the limits or the operators
        cannot be expressed as a range, and a NotImplementedError for
        the operators that are not supported yet.

        """
        table = column.table
        # Get the coordinates for those values
        ilimit = table.opsValues
        ctype = column.type
        sctype = str(ctype)
        itemsize = table.colitemsizes[column.pathname]

        # Check that limits are compatible with type
        for limit in ilimit:
            # Check for strings
            if sctype == "CharType":
                if type(limit) is not str:
                    raise TypeError("""\
Bounds (or range limits) for string columns can only be strings.""")
                else:
                    continue

            nactype = numarray.typeDict[sctype]

            # Check for booleans
            if isinstance(nactype, numarray.BooleanType):
                if type(limit) not in (int, long, bool):
                    raise TypeError("""\
Bounds (or range limits) for bool columns can only be ints or booleans.""")
            # Check for ints
            elif isinstance(nactype, numarray.IntegralType):
                if type(limit) not in (int, long, float):
                    raise TypeError("""\
Bounds (or range limits) for integer columns can only be ints or floats.""")
            # Check for floats
            elif isinstance(nactype, numarray.FloatingType):
                if type(limit) not in (int, long, float):
                    raise TypeError("""\
Bounds (or range limits) for float columns can only be ints or floats.""")
            else:
                raise TypeError("""
Bounds (or range limits) can only be strings, bools, ints or floats.""")

        # Boolean types are a special case for searching
        if sctype == "Bool":
            if len(table.ops) == 1 and table.ops[0] == 5: # __eq__
                return self.search((ilimit[0], ilimit[0]))
            raise NotImplementedError(
                "Only equality operator is supported for boolean columns.")

        # Other types are treated here
        if len(ilimit) == 1:
            limit = ilimit[0]
            op = table.ops[0]
            if op == 1: # __lt__
                item = (infType(type=ctype, itemsize=itemsize, sign=-1),
                        nextafter(limit, -1, ctype, itemsize))
            elif op == 2: # __le__
                item = (infType(type=ctype, itemsize=itemsize, sign=-1),
                        limit)
            elif op == 3: # __gt__
                item = (nextafter(limit, +1, ctype, itemsize),
                        infType(type=ctype, itemsize=itemsize, sign=0))
            elif op == 4: # __ge__
                item = (limit,
                        infType(type=ctype, itemsize=itemsize, sign=0))
            elif op == 5: # __eq__
                item = (limit, limit)
            elif op == 6: # __ne__
                # I need to cope with this
                raise NotImplementedError("'!=' or '<>' not supported yet")
            else:
                # Without this, the search below would fail with an
                # obscure unbound variable error
                raise ValueError(
                    "Operator code %r is not supported." % (op,))
        elif len(ilimit) == 2:
            item1, item2 = ilimit
            if item1 > item2:
                raise ValueError("""\
On 'val1 <{=} col <{=} val2' selections, \
val1 must be less or equal than val2""")
            op1, op2 = table.ops
            if op1 == 3 and op2 == 1:  # item1 < col < item2
                item = (nextafter(item1, +1, ctype, itemsize),
                        nextafter(item2, -1, ctype, itemsize))
            elif op1 == 4 and op2 == 1:  # item1 <= col < item2
                item = (item1, nextafter(item2, -1, ctype, itemsize))
            elif op1 == 3 and op2 == 2:  # item1 < col <= item2
                item = (nextafter(item1, +1, ctype, itemsize), item2)
            elif op1 == 4 and op2 == 2:  # item1 <= col <= item2
                item = (item1, item2)
            else:
                raise ValueError(
"Combination of operators not supported. Use val1 <{=} col <{=} val2")
        else:
            # Same remark as for the unknown operator codes
            raise ValueError(
                "One or two range limits are needed, but %d were given."
                % len(ilimit))

        ncoords = self.search(item)
        return ncoords

    def _remove(self):
        """Remove this Index object"""

        if utilsExtension.whichLibVersion("hdf5")[1] == "1.6.3":
            warnings.warn("""\
You are using HDF5 version 1.6.3. It turns out that this precise
version has a bug that causes a seg fault when deleting a chunked
dataset. If you are getting such a seg fault immediately after this
message, please, get a patched version of HDF5 1.6.3, or, better,
get HDF5 1.6.4.""")

        # Delete the associated IndexArrays
        self.sorted.remove()
        self.indices.remove()
        # delete some references
        mydict = self.__dict__
        mydict['indices'] = None
        mydict['sorted'] = None
        # delete self
        self._f_remove()

    def _f_close(self):
        """Close the IndexArrays of this index and the index itself."""
        # Check that the index is not already closed
        if self._v_file is None:
            return
        # close the indices (might be already removed by self._g_remove!)
        if hasattr(self,"sorted") and self.sorted is not None:
            self.sorted.close()
        if hasattr(self,"indices") and self.indices is not None:
            self.indices.close()
        # delete some references
        mydict = self.__dict__
        mydict['atom'] = None
        mydict['column'] = None
        mydict['filters'] = None
        mydict['indices'] = None
        mydict['sorted'] = None
        #mydict['_v_attrs'] = None  # This is deleted in Group._f_close()
        # Call the superclass close method
        Group._f_close(self)

    def __str__(self):
        """This provides a more compact representation than __repr__"""
        return "Index(%s, shape=%s, chunksize=%s)" % \
               (self.nelements, self.shape, self.chunksize)

    def __repr__(self):
        """This provides more metainfo than standard __repr__"""

        cpathname = self.column.table._v_pathname + ".cols." + self.column.name
        dirty = self.column.dirty
        return """%s (Index for column %s)
  type := %r
  nelements := %s
  shape := %s
  chunksize := %s
  byteorder := %r
  filters := %s
  dirty := %s
  sorted := %s
  indices := %s""" % (self._v_pathname, cpathname,
                     self.type, self.nelements, self.shape,
                     self.sorted.chunksize, self.sorted.byteorder,
                     self.filters, dirty, self.sorted, self.indices)
