# ***** BEGIN LICENSE BLOCK *****
# Version: RCSL 1.0/RPSL 1.0/GPL 2.0
#
# Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved.
# Portions Copyright (c) 2004 Robert Kaye. All Rights Reserved.
#
# The contents of this file, and the files included with this file, are
# subject to the current version of the RealNetworks Public Source License
# Version 1.0 (the "RPSL") available at
# http://www.helixcommunity.org/content/rpsl unless you have licensed
# the file under the RealNetworks Community Source License Version 1.0
# (the "RCSL") available at http://www.helixcommunity.org/content/rcsl,
# in which case the RCSL will apply. You may also obtain the license terms
# directly from RealNetworks.  You may not use this file except in
# compliance with the RPSL or, if you have a valid RCSL with RealNetworks
# applicable to this file, the RCSL.  Please see the applicable RPSL or
# RCSL for the rights, obligations and limitations governing use of the
# contents of the file.
#
# This file is part of the Helix DNA Technology. RealNetworks is the
# developer of the Original Code and owns the copyrights in the portions
# it created.
#
# This file, and the files included with this file, is distributed and made
# available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
# EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES,
# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
#
# Technology Compatibility Kit Test Suite(s) Location:
#    http://www.helixcommunity.org/content/tck
#
# --------------------------------------------------------------------
#
# picard is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# picard is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with picard; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
#
# Contributor(s):
#   Robert Kaye
#
#
# ***** END LICENSE BLOCK *****

import re
import copy
from heapq import heappush, heappop
from tunepimp import metadata
import dataobjs
import albummanager
import wx
import events

class ClusterManager(object):
    '''The ClusterManager keeps the existing ClusterAlbum objects, keyed by
       the id it hands out when a cluster is added.'''

    def __init__(self, context):
        self.context = context
        # cluster id ("cluster<N>") -> cluster object
        self.clusters = {}
        # number used to build the id of the next cluster that gets added
        self.nextid = 1

    def dumpState(self, d):
        '''Report the manager and every cluster it holds through the dump
           callable d (clusters are reported one nesting level deeper).'''
        d(u"ClusterManager object: %s" % repr(self))
        d = d.nest()
        d(u"number of clusters: %d" % len(self.clusters))
        for c in self.clusters.values():
            c.dumpState(d)

    def findCluster(self, clusterId):
        '''Return the cluster registered under clusterId.
           Raises KeyError when no such cluster is registered.'''
        return self.clusters[clusterId]

    def getClusters(self):
        '''Return a new list with all registered clusters.'''
        # list() guarantees a snapshot, so callers may add or remove
        # clusters while walking over the result.
        return list(self.clusters.values())

    def addCluster(self, albumcluster):
        '''Assign a fresh id to albumcluster, register it, and post an
           EVT_CLUSTER_ADDED_ID ClusterEvent to the context's frame.'''
        albumcluster.setId("cluster%d" % self.nextid)
        self.nextid += 1

        self.clusters[albumcluster.getId()] = albumcluster
        albumcluster.setClusterManager(self)

        wx.PostEvent(self.context.frame, events.ClusterEvent(albumcluster, events.EVT_CLUSTER_ADDED_ID))
        wx.WakeUpIdle()

    def removeCluster(self, albumcluster):
        '''Unregister albumcluster, detach it from this manager, and post an
           EVT_CLUSTER_REMOVED_ID ClusterEvent to the context's frame.
           Raises KeyError when the cluster is not registered.'''
        del self.clusters[albumcluster.getId()]
        albumcluster.setClusterManager(None)
        wx.PostEvent(self.context.frame, events.ClusterEvent(albumcluster, events.EVT_CLUSTER_REMOVED_ID))
        wx.WakeUpIdle()

    def CanCluster(self):
        '''Return True when the pending-files album reports no unmatched
           files and the unmatched-files album reports at least one.
           Returns False when either album cannot be looked up.'''
        manager = self.context.albummanager
        try:
            pe = manager.get(albummanager.pendingFiles)
            un = manager.get(albummanager.unmatchedFiles)
        except KeyError:
            return False

        return (pe.getNumUnmatchedFiles() == 0
                and un.getNumUnmatchedFiles() > 0)

class ClusterAlbum(dataobjs.DataObject):
    '''The ClusterAlbum class is a cluster of files that the cluster algorithm
       has deemed to be related to one another'''

    def __init__(self, context, artistName, albumName, fileIdList):
        '''Create a cluster called albumName by artistName.  fileIdList is
           kept by reference (not copied) as the cluster's list of file ids.'''
        dataobjs.DataObject.__init__(self, context.config, albumName, "")
        self.fileIdList = fileIdList
        self.artistName = artistName
        # Both are filled in by ClusterManager.addCluster()
        self.clustermanager = None
        self.id = None
        self.context = context

    def dumpState(self, d):
        '''Report this cluster's id, names and file ids through the dump
           callable d.'''
        d(u"ClusterAlbum object: %s" % repr(self))
        d = d.nest()
        d(u"id: %s" % repr(self.id))
        # self.name is not set in this class; presumably DataObject.__init__
        # stores albumName there -- verify against dataobjs.
        d(u"albumName: %s" % repr(self.name))
        d(u"artistName: %s" % repr(self.artistName))
        d(u"fileIdList: %s" % repr(self.fileIdList))

    def getClusterManager(self):
        '''Return the manager this cluster is registered with, or None.'''
        return self.clustermanager

    def setClusterManager(self, clustermanager):
        self.clustermanager = clustermanager

    def getArtistName(self):
        return self.artistName

    def getFileIds(self):
        '''Return a shallow copy of the file id list.'''
        return copy.copy(self.fileIdList)

    def getNumFiles(self):
        return len(self.fileIdList)

    def removeFile(self, tpfile):
        '''Drop tpfile's id from the cluster and notify the frame.  When the
           cluster becomes empty it is also removed from its manager.
           Raises ValueError when the file is not part of this cluster.'''
        fileId = tpfile.getFileId()
        self.fileIdList.remove(fileId)

        wx.PostEvent(self.context.frame, events.ClusterEvent(self, events.EVT_CLUSTER_CHANGED_ID))
        wx.PostEvent(self.context.frame, events.ClusterFileEvent(self, fileId, events.EVT_CLUSTER_FILE_REMOVED_ID))

        if self.getNumFiles() == 0:
            manager = self.getClusterManager()
            # A cluster that was never registered (or was already removed)
            # has no manager; there is nothing to unregister in that case.
            if manager is not None:
                manager.removeCluster(self)

        wx.WakeUpIdle()

    def addFile(self, tpfile):
        '''Append tpfile's id to the cluster and notify the frame.'''
        fileId = tpfile.getFileId()
        self.fileIdList.append(fileId)

        wx.PostEvent(self.context.frame, events.ClusterEvent(self, events.EVT_CLUSTER_CHANGED_ID))
        wx.PostEvent(self.context.frame, events.ClusterFileEvent(self, fileId, events.EVT_CLUSTER_FILE_ADDED_ID))
        wx.WakeUpIdle()

    def moveFilesToMe(self, context):
        '''Move every file listed in this cluster into this cluster.'''
        # This is a rather ugly hack.  When the cluster is first created, it
        # is passed a list of file IDs; files that "should belong" in this
        # cluster.  However the files themselves (TPFile objects) aren't
        # actually in this cluster yet.  Calling this method moves all those
        # files into this cluster.
        #
        # getFileIds() returns a copy, so it is safe to remove ids from the
        # real list while looping.  Each id is dropped before the move;
        # presumably moveToCluster() adds it back through addFile() --
        # verify against the TPFile implementation.
        for fileid in self.getFileIds():
            self.fileIdList.remove(fileid)
            tpfile = context.tpmanager.findFile(fileid)
            tpfile.moveToCluster(self)

    def getId(self):
        return self.id

    def setId(self, id):
        self.id = id

class ClusterDict(object):
    '''Dictionary of the distinct words (e.g. artist or album names) fed to
       the cluster engine.  Every distinct word gets a sequential integer id,
       a usage count, and a normalized token (lowercased, with all non-word
       characters stripped).'''

    def __init__(self):

        # word -> (id, occurrence count) index
        self.words = {}

        # id -> (word, token) index
        self.ids = {}

        # counter for new id generation; equals the number of distinct words
        self.id = 0

        # matches every character that is not a (unicode) word character
        self.regexp = re.compile(u'\\W', re.UNICODE)

    def getSize(self):
        '''Return the number of distinct words added so far.'''
        return self.id

    def tokenize(self, word):
        '''Return word lowercased with all non-word characters removed.'''
        return self.regexp.sub(u'', word.lower())

    def add(self, word):
        '''Record one occurrence of word and return its id.

           Returns -1 (and records nothing) when word is None or empty, or
           when nothing is left of it after tokenizing.'''

        # "not word" also covers None, which used to blow up in tokenize()
        if not word:
            return -1

        token = self.tokenize(word)
        if not token:
            return -1

        try:
            index, count = self.words[word]
        except KeyError:
            # first occurrence: hand out the next id
            index = self.id
            self.words[word] = (index, 1)
            self.ids[index] = (word, token)
            self.id = index + 1
        else:
            self.words[word] = (index, count + 1)

        return index

    def getWord(self, index):
        '''Return the word stored under index, or None for an unknown id.'''
        return self.ids.get(index, (None, None))[0]

    def getToken(self, index):
        '''Return the token stored under index, or None for an unknown id.'''
        return self.ids.get(index, (None, None))[1]

    def getWordAndCount(self, index):
        '''Return (word, occurrence count) for index, or (None, 0) for an
           unknown id.'''
        entry = self.ids.get(index)
        if entry is None:
            return None, 0

        word = entry[0]
        return word, self.words.get(word, (index, 0))[1]

class ClusterEngine(object):
    '''Groups the words of a ClusterDict into clusters of similar words.

       The engine keeps its state (bins, index, counter) on the instance and
       never resets it, so an instance is meant for a single cluster() run.'''

    def __init__(self, config, clusterDict):
        # the cluster dictionary we're using
        self.clusterDict = clusterDict
        # keeps track of unique cluster index
        self.clusterCount = 0
        # Keeps track of the clusters we've created: cluster -> list of word ids
        self.clusterBins = {}
        # Index the word ids -> clusters
        self.idClusterIndex = {}
        self.config = config

    def getClusterFromId(self, id):
        '''Return the cluster the word id belongs to, or -1 if it is in none.'''
        return self.idClusterIndex.get(id, -1)

    def printCluster(self, cluster):
        '''Debug helper: print the words of the given cluster to stdout.'''
        if cluster < 0:
            print("[no such cluster]")
            return

        words = ["'%s'" % self.clusterDict.getWord(i)
                 for i in self.clusterBins[cluster]]
        # single pre-formatted argument: prints the same text whether print
        # is a statement or a function
        print("%s  ->  %s" % (cluster, ", ".join(words)))

    def getClusterTitle(self, cluster):
        '''Return the most frequently used word of the cluster (the last one
           in bin order when several share the top count), or "" for a
           negative cluster number.'''

        if cluster < 0:
            return ""

        bestCount = 0
        bestWord = u''
        for wordId in self.clusterBins[cluster]:
            word, count = self.clusterDict.getWordAndCount(wordId)
            # ">=" on purpose: a later word wins a tie
            if count >= bestCount:
                bestWord = word
                bestCount = count

        return bestWord

    def cluster(self, threshold, similarity=None):
        '''Cluster all words of the dictionary and return the bins.

           threshold  -- minimum similarity for two words to be linked
           similarity -- optional callable(tokenA, tokenB) returning the
                         similarity of two tokens; defaults to the
                         similarity() method of a tunepimp metadata object
                         built from the engine's config.

           Returns the dictionary cluster number -> list of word ids.'''

        if similarity is None:
            similarity = metadata.metadata(self.config.getTunePimp()).similarity

        size = self.clusterDict.getSize()

        # Look the tokens up once instead of on every pass of the pair loop
        tokens = [self.clusterDict.getToken(i).lower() for i in range(size)]

        # keep the matches sorted in a heap; it is a min-heap, so we store
        # the distance (1 - similarity) to get the best matches out first
        heap = []
        for y, tokenY in enumerate(tokens):
            for x, tokenX in enumerate(tokens[:y]):
                c = similarity(tokenX, tokenY)
                if c >= threshold:
                    heappush(heap, (1.0 - c, (x, y)))

        # Every word that occurs more than once starts out as its own cluster
        for i in range(size):
            word, count = self.clusterDict.getWordAndCount(i)
            if word and count > 1:
                self.clusterBins[self.clusterCount] = [i]
                self.idClusterIndex[i] = self.clusterCount
                self.clusterCount = self.clusterCount + 1

        while heap:
            distance, pair = heappop(heap)
            first, second = pair

            match0 = self.idClusterIndex.get(first, -1)
            match1 = self.idClusterIndex.get(second, -1)

            if match0 < 0 and match1 < 0:
                # if neither item is in a cluster, make a new cluster
                self.clusterBins[self.clusterCount] = [first, second]
                self.idClusterIndex[first] = self.clusterCount
                self.idClusterIndex[second] = self.clusterCount
                self.clusterCount = self.clusterCount + 1

            elif match1 < 0:
                # only the first item is in a bin: stick the other one in too
                self.clusterBins[match0].append(second)
                self.idClusterIndex[second] = match0

            elif match0 < 0:
                # only the second item is in a bin: stick the other one in too
                self.clusterBins[match1].append(first)
                self.idClusterIndex[first] = match1

            elif match0 != match1:
                # both items sit in different clusters: fold the second
                # cluster into the first and drop it
                absorbed = self.clusterBins.pop(match1)
                self.clusterBins[match0].extend(absorbed)
                for wordId in absorbed:
                    self.idClusterIndex[wordId] = match0

        return self.clusterBins

class FileClusterEngine(object):
    '''Clusters files into ClusterAlbum objects based on the artist and album
       names found in their local metadata.'''

    def __init__(self, context):
        self.context = context
        self.config = context.config
        # Result of the last cluster() run.  Set up here so that
        # getClusterAlbums() also works before the first run.
        self.clusters = []

    def getClusterAlbums(self):
        '''Return the list of ClusterAlbum objects built by the last call to
           cluster() (empty if cluster() has not been called yet).'''
        return self.clusters

    def _readLocalMetadata(self, fileId):
        '''Return the local metadata of the track with the given file id.'''
        tr = self.config.tunePimp.getTrack(fileId)
        # The track is unlocked and released even if reading the metadata
        # fails, so an error cannot leave it locked or leak the reference.
        try:
            tr.lock()
            try:
                return tr.getLocalMetadata()
            finally:
                tr.unlock()
        finally:
            self.config.getTunePimp().releaseTrack(tr)

    def cluster(self, fileIdList, threshold):
        '''Cluster the given files by album name and store one ClusterAlbum
           per album cluster in self.clusters.

           fileIdList -- ids of the files to cluster
           threshold  -- minimum similarity passed on to the artist and
                         album ClusterEngine runs

           Files whose album name ends up in no album cluster are left out.'''

        self.clusters = []
        artistDict = ClusterDict()
        albumDict = ClusterDict()
        tracks = []

        for fileId in fileIdList:
            ldata = self._readLocalMetadata(fileId)
            tracks.append({ 'file':fileId,
                            'artistIndex':artistDict.add(ldata.artist),
                            'albumIndex':albumDict.add(ldata.album) })

        artistClusterEngine = ClusterEngine(self.config, artistDict)
        artistClusterEngine.cluster(threshold)

        albumClusterEngine = ClusterEngine(self.config, albumDict)
        albumClusterEngine.cluster(threshold)

        # Arrange tracks into albums
        albums = {}
        for track in tracks:
            albumId = albumClusterEngine.getClusterFromId(track['albumIndex'])
            if albumId >= 0:
                albums.setdefault(albumId, []).append(track)

        # Now determine the most prominent names in the cluster and build the final cluster list
        for albumId, members in albums.items():

            albumName = albumClusterEngine.getClusterTitle(albumId)

            # artist cluster (-1 = artist in no cluster) -> number of tracks
            artistHist = {}
            fileList = []
            for track in members:
                artistId = artistClusterEngine.getClusterFromId(track['artistIndex'])
                fileList.append(track['file'])
                artistHist[artistId] = artistHist.get(artistId, 0) + 1

            if list(artistHist) == [-1]:
                # none of the tracks has a clustered artist
                artistName = u"Unknown"
            else:
                # Most frequent artist cluster; ties go to the higher
                # cluster number.
                # NOTE(review): when unclustered artists (-1) outnumber every
                # real cluster the title comes out as "" -- confirm whether
                # that is intended.
                count, topArtist = max([(n, artistId) for artistId, n in artistHist.items()])
                artistName = artistClusterEngine.getClusterTitle(topArtist)

            self.clusters.append(ClusterAlbum(self.context, artistName, albumName, fileList))
