class Smoothing: ngramLM = None ngramModel = None order = None counts = None _defParameters = None def SetNgramLM(self, ngramLM, order, counts): self.ngramLM = ngramLM self.ngramModel = ngramLM.ngramModel self.order = order self.counts = counts def GetDefaultParameters(self): return self._defParameters def GetParameterBounds(self): return [] def IsOutOfRange(self, params): return False # Derived Methods: # def ComputeModel(self, outProbs, outBows, params, backoffProbs): None ############################################################################### class ModKneserNeySmoothing(Smoothing): # Inherited Variables: # ngramLM = None # ngramModel = None # order = None # counts = None # _defParameters = None effCounts = None histories = None backoffs = None invHistCounts = None bosIndex = None # Inherited Methods: # def GetDefaultParameters(self): return self._defParameters def __init__(self, discountOrder=3): self.discountOrder = discountOrder self.discFactors = numpy.zeros(discountOrder + 1) def SetNgramLM(self, ngramLM, order, counts): # Initialize member variables and precompute intermediate values. Smoothing.SetNgramLM(self, ngramLM, order, counts) ngramModel = ngramLM.ngramModel self.histories = ngramModel.histories[order] self.backoffs = ngramModel.backoffs[order] if self.order == 1: self.bosIndex = ngramModel.GetNgramList(1)._Find( 0, Vocab.BeginOfSentence) self._ComputeEffectiveCounts() self._ComputeInvHistCounts() self._ComputeDiscountingFactors() def GetParameterBounds(self): # Returns (min, max) for each parameter. return [(0, i) for i in self._paramIndices] def IsOutOfRange(self, params): # Returns if any of the parameters are out of range. return numpy.any(params < 0) or numpy.any(params > self._paramIndices) def ComputeModel(self, outProbs, outBows, params, backoffProbs): # Compute model probabilities and backoff weights. discounts = outProbs # Use outProbs as temp buffer self._ComputeDiscounts(discounts, params) self._ComputeBackoffWeights(outBows, discounts) self._ComputeInterpProbs(discounts, outBows, backoffProbs, outProbs) def _ComputeEffectiveCounts(self): # Compute Kneser-Ney (left branch) counts for lower orders. if self.order < self.ngramModel.order: # Only use n-grams that appear at least once for left branch count. self.effCounts = numpy.zeros(len(self.counts), dtype=numpy.uint32) Utils.BinCountNonZeroWeightUInt32UInt32( self.ngramModel.backoffs[self.order+1], self.ngramLM.countsList[self.order+1], self.effCounts) # For n-grams without left context (i.e. words starting with ), # use original count. self.effCounts[self.effCounts==0] = self.counts[self.effCounts==0] if self.bosIndex != None: self.effCounts[self.bosIndex] = 0 else: self.effCounts = self.counts self.numNonZeroEffCounts = (self.effCounts != 0).sum() def _ComputeInvHistCounts(self): # Precompute inverse of total effective counts for each history. self.invHistCounts = numpy.zeros( len(self.ngramModel.histories[self.order-1]), dtype=numpy.uint32) assert(len(self.histories) == len(self.effCounts)) Utils.BinCountWeightUInt32UInt32(self.histories, self.effCounts, self.invHistCounts) self.invHistCounts = 1.0 / self.invHistCounts def _ComputeDiscountingFactors(self): # Compute fixed modified Kneser-Ney discount factors based on counts. n = numpy.zeros(self.discountOrder + 2, dtype=numpy.uint32) Utils.BinCountUInt32UInt32(self.effCounts, n) n[0] = 0 # Ignore # Compute discount factors for parameters associated with ngrams. Y = float(n[1]) / (n[1] + 2*n[2]) self._paramIndices = self._ComputeRelevantParams(n) discFactors = numpy.array( [(i - (i+1)*Y*n[i+1]/n[i]) for i in self._paramIndices], dtype=numpy.float64) # Force discount factors estimated by fixed formula to be within range. discFactors[numpy.isnan(discFactors)] = 1e-5 self._defParameters = numpy.clip(discFactors, 1e-5, self._paramIndices) def _ComputeRelevantParams(self, n): # Compute the indices of the parameter that has a non-zero count. # Since the last parameter is used for all n-grams with higher counts, # we will always include the index for the last parameter. n[-2] += 1 # Force last parameter to be always included. paramIndices = numpy.arange(len(n) - 1)[n[:-1] != 0] n[-2] -= 1 return paramIndices def _ComputeDiscounts(self, outDiscounts, params): # Compute modified Kneser-Ney discounts for each n-gram. self.discFactors[self._paramIndices] = params Utils.BinLookupClipUInt32Float64(self.effCounts, self.discFactors, outDiscounts) def _ComputeBackoffWeights(self, outBows, discounts): # Compute backoff weights (gamma) assigned with each n-gram history. outBows[:] = 0 Utils.BinCountWeightUInt32Float64(self.histories, discounts, outBows) outBows *= self.invHistCounts outBows[numpy.isnan(outBows)] = 1.0 # Define 0/0 = 1 for backoffs def _ComputeInterpProbs(self, discounts, bows, backoffProbs, outProbs): # Compute interpolated probabilities. if (self.order == 1): # Assign 0 probability to n-grams with 0 counts. # Non-zero count unigrams backoff to uniform distribution. backoffProbs = numpy.array([1.0 / self.numNonZeroEffCounts]) Algorithms.ComputeInterpolatedProbabilities(self.effCounts, discounts, self.histories, self.backoffs, self.invHistCounts, bows, backoffProbs, False, outProbs) else: # For higher order, assign backoff probability to these n-grams. Algorithms.ComputeInterpolatedProbabilities(self.effCounts, discounts, self.histories, self.backoffs, self.invHistCounts, bows, backoffProbs, True, outProbs)