TopKSelector.java

/*
 * Copyright (C) 2014 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.collect;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.GwtCompatible;
import com.google.common.math.IntMath;
import java.math.RoundingMode;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Stream;
import org.checkerframework.checker.nullness.qual.Nullable;

/**
 * An accumulator that selects the "top" {@code k} elements added to it, relative to a provided
 * comparator. "Top" can mean the greatest or the lowest elements, specified in the factory used to
 * create the {@code TopKSelector} instance.
 *
 * <p>If your input data is available as a {@link Stream}, prefer passing {@link
 * Comparators#least(int)} to {@link Stream#collect(java.util.stream.Collector)}. If it is available
 * as an {@link Iterable} or {@link Iterator}, prefer {@link Ordering#leastOf(Iterable, int)}.
 *
 * <p>This uses the same efficient implementation as {@link Ordering#leastOf(Iterable, int)},
 * offering expected O(n + k log k) performance (worst case O(n log k)) for n calls to {@link
 * #offer} and a call to {@link #topK}, with O(k) memory. In comparison, quickselect has the same
 * asymptotics but requires O(n) memory, and a {@code PriorityQueue} implementation takes O(n log
 * k). In benchmarks, this implementation performs at least as well as either implementation, and
 * degrades more gracefully for worst-case input.
 *
 * <p>The implementation does not necessarily use a <i>stable</i> sorting algorithm; when multiple
 * equivalent elements are added to it, it is undefined which will come first in the output.
 *
 * @author Louis Wasserman
 */
@GwtCompatible
final class TopKSelector<T> {

  /**
   * Returns a {@code TopKSelector} that collects the lowest {@code k} elements added to it,
   * relative to the natural ordering of the elements, and returns them via {@link #topK} in
   * ascending order.
   *
   * @throws IllegalArgumentException if {@code k < 0} or {@code k > Integer.MAX_VALUE / 2}
   */
  public static <T extends Comparable<? super T>> TopKSelector<T> least(int k) {
    return least(k, Ordering.natural());
  }

  /**
   * Returns a {@code TopKSelector} that collects the lowest {@code k} elements added to it,
   * relative to the specified comparator, and returns them via {@link #topK} in ascending order.
   *
   * @throws IllegalArgumentException if {@code k < 0} or {@code k > Integer.MAX_VALUE / 2}
   */
  public static <T> TopKSelector<T> least(int k, Comparator<? super T> comparator) {
    return new TopKSelector<T>(comparator, k);
  }

  /**
   * Returns a {@code TopKSelector} that collects the greatest {@code k} elements added to it,
   * relative to the natural ordering of the elements, and returns them via {@link #topK} in
   * descending order.
   *
   * @throws IllegalArgumentException if {@code k < 0} or {@code k > Integer.MAX_VALUE / 2}
   */
  public static <T extends Comparable<? super T>> TopKSelector<T> greatest(int k) {
    return greatest(k, Ordering.natural());
  }

  /**
   * Returns a {@code TopKSelector} that collects the greatest {@code k} elements added to it,
   * relative to the specified comparator, and returns them via {@link #topK} in descending order.
   *
   * @throws IllegalArgumentException if {@code k < 0} or {@code k > Integer.MAX_VALUE / 2}
   */
  public static <T> TopKSelector<T> greatest(int k, Comparator<? super T> comparator) {
    return new TopKSelector<T>(Ordering.from(comparator).reverse(), k);
  }

  private final int k;
  private final Comparator<? super T> comparator;

  /*
   * We are currently considering the elements in buffer in the range [0, bufferSize) as candidates
   * for the top k elements. Whenever the buffer is filled, we quickselect the top k elements to the
   * range [0, k) and ignore the remaining elements.
   */
  private final T[] buffer;
  private int bufferSize;

  /**
   * The largest of the lowest k elements we've seen so far relative to this comparator. If
   * bufferSize ≥ k, then we can ignore any elements greater than this value.
   */
  private @Nullable T threshold;

  private TopKSelector(Comparator<? super T> comparator, int k) {
    this.comparator = checkNotNull(comparator, "comparator");
    this.k = k;
    checkArgument(k >= 0, "k (%s) must be >= 0", k);
    checkArgument(k <= Integer.MAX_VALUE / 2, "k (%s) must be <= Integer.MAX_VALUE / 2", k);
    this.buffer = (T[]) new Object[IntMath.checkedMultiply(k, 2)];
    this.bufferSize = 0;
    this.threshold = null;
  }

  /**
   * Adds {@code elem} as a candidate for the top {@code k} elements. This operation takes amortized
   * O(1) time.
   */
  public void offer(@Nullable T elem) {
    if (k == 0) {
      return;
    } else if (bufferSize == 0) {
      buffer[0] = elem;
      threshold = elem;
      bufferSize = 1;
    } else if (bufferSize < k) {
      buffer[bufferSize++] = elem;
      if (comparator.compare(elem, threshold) > 0) {
        threshold = elem;
      }
    } else if (comparator.compare(elem, threshold) < 0) {
      // Otherwise, we can ignore elem; we've seen k better elements.
      buffer[bufferSize++] = elem;
      if (bufferSize == 2 * k) {
        trim();
      }
    }
  }

  /**
   * Quickselects the top k elements from the 2k elements in the buffer. O(k) expected time, O(k log
   * k) worst case.
   */
  private void trim() {
    int left = 0;
    int right = 2 * k - 1;

    int minThresholdPosition = 0;
    // The leftmost position at which the greatest of the k lower elements
    // -- the new value of threshold -- might be found.

    int iterations = 0;
    int maxIterations = IntMath.log2(right - left, RoundingMode.CEILING) * 3;
    while (left < right) {
      int pivotIndex = (left + right + 1) >>> 1;

      int pivotNewIndex = partition(left, right, pivotIndex);

      if (pivotNewIndex > k) {
        right = pivotNewIndex - 1;
      } else if (pivotNewIndex < k) {
        left = Math.max(pivotNewIndex, left + 1);
        minThresholdPosition = pivotNewIndex;
      } else {
        break;
      }
      iterations++;
      if (iterations >= maxIterations) {
        // We've already taken O(k log k), let's make sure we don't take longer than O(k log k).
        Arrays.sort(buffer, left, right, comparator);
        break;
      }
    }
    bufferSize = k;

    threshold = buffer[minThresholdPosition];
    for (int i = minThresholdPosition + 1; i < k; i++) {
      if (comparator.compare(buffer[i], threshold) > 0) {
        threshold = buffer[i];
      }
    }
  }

  /**
   * Partitions the contents of buffer in the range [left, right] around the pivot element
   * previously stored in buffer[pivotValue]. Returns the new index of the pivot element,
   * pivotNewIndex, so that everything in [left, pivotNewIndex] is ≤ pivotValue and everything in
   * (pivotNewIndex, right] is greater than pivotValue.
   */
  private int partition(int left, int right, int pivotIndex) {
    T pivotValue = buffer[pivotIndex];
    buffer[pivotIndex] = buffer[right];

    int pivotNewIndex = left;
    for (int i = left; i < right; i++) {
      if (comparator.compare(buffer[i], pivotValue) < 0) {
        swap(pivotNewIndex, i);
        pivotNewIndex++;
      }
    }
    buffer[right] = buffer[pivotNewIndex];
    buffer[pivotNewIndex] = pivotValue;
    return pivotNewIndex;
  }

  private void swap(int i, int j) {
    T tmp = buffer[i];
    buffer[i] = buffer[j];
    buffer[j] = tmp;
  }

  TopKSelector<T> combine(TopKSelector<T> other) {
    for (int i = 0; i < other.bufferSize; i++) {
      this.offer(other.buffer[i]);
    }
    return this;
  }

  /**
   * Adds each member of {@code elements} as a candidate for the top {@code k} elements. This
   * operation takes amortized linear time in the length of {@code elements}.
   *
   * <p>If all input data to this {@code TopKSelector} is in a single {@code Iterable}, prefer
   * {@link Ordering#leastOf(Iterable, int)}, which provides a simpler API for that use case.
   */
  public void offerAll(Iterable<? extends T> elements) {
    offerAll(elements.iterator());
  }

  /**
   * Adds each member of {@code elements} as a candidate for the top {@code k} elements. This
   * operation takes amortized linear time in the length of {@code elements}. The iterator is
   * consumed after this operation completes.
   *
   * <p>If all input data to this {@code TopKSelector} is in a single {@code Iterator}, prefer
   * {@link Ordering#leastOf(Iterator, int)}, which provides a simpler API for that use case.
   */
  public void offerAll(Iterator<? extends T> elements) {
    while (elements.hasNext()) {
      offer(elements.next());
    }
  }

  /**
   * Returns the top {@code k} elements offered to this {@code TopKSelector}, or all elements if
   * fewer than {@code k} have been offered, in the order specified by the factory used to create
   * this {@code TopKSelector}.
   *
   * <p>The returned list is an unmodifiable copy and will not be affected by further changes to
   * this {@code TopKSelector}. This method returns in O(k log k) time.
   */
  public List<T> topK() {
    Arrays.sort(buffer, 0, bufferSize, comparator);
    if (bufferSize > k) {
      Arrays.fill(buffer, k, buffer.length, null);
      bufferSize = k;
      threshold = buffer[k - 1];
    }
    // we have to support null elements, so no ImmutableList for us
    return Collections.unmodifiableList(Arrays.asList(Arrays.copyOf(buffer, bufferSize)));
  }
}