Algorytm na losowe samplowanie z setu bez zwracania

0

Nudzi mi się to wymyślam sobie koło na nowo:

package com.task;

import com.google.common.collect.Streams;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class Task {

    interface SampleAlgorithm <T extends Collection<?>> {
        T sample(T collection, int sampleSize);
        String getName();
    }

    interface SetSampleAlgorithm extends SampleAlgorithm<Set<Integer>> {
        Set<Integer> sample(Set<Integer> set, int sampleSize);
        String getName();
    }

    static class ShuffleSampleAlg implements SetSampleAlgorithm {

        static final String NAME = "shuffle";

        @Override
        public Set<Integer> sample(Set<Integer> set, int sampleSize) {
            List<Integer> listFromSet = new ArrayList<>(set);
            Collections.shuffle(listFromSet);
            return listFromSet.stream().limit(sampleSize).collect(Collectors.toSet());
        }

        @Override
        public String getName() {
            return NAME;
        }
    }

    static class ArraySampleAlg implements SetSampleAlgorithm {

        static final String NAME = "array";

        @Override
        public Set<Integer> sample(Set<Integer> set, int sampleSize) {
            List<Integer> listFromSet = new ArrayList<>(set);
            Set<Integer> rndIdx = new Random().ints(sampleSize, 0, set.size()).boxed().collect(Collectors.toSet());
            return rndIdx.stream().map(i -> listFromSet.get(i)).collect(Collectors.toSet());
        }

        @Override
        public String getName() {
            return NAME;
        }
    }

    static class IterateSampleAlg implements SetSampleAlgorithm {

        static final String NAME = "iterate";

        @Override
        public Set<Integer> sample(Set<Integer> set, int sampleSize) {
            Set<Integer> rndIdx = new Random().ints().limit(sampleSize).boxed().collect(Collectors.toSet());
            return Streams.mapWithIndex(
                    set.stream(),
                    (elem, idx) -> rndIdx.contains(idx) ? elem : null
            )
                    .filter(Objects::nonNull)
                    .collect(Collectors.toSet());
        }

        @Override
        public String getName() {
            return NAME;
        }
    }

    public static void main(String[] args) {

        List<Integer> sampleSizes = List.of(9, 999, 99999, 999999);

        Set<Integer> set = Stream.iterate(0, i -> i + 1)
                .limit(9999999)
                .map(i -> new Random().nextInt())
                .collect(Collectors.toUnmodifiableSet());

        List<SampleAlgorithm<Set<Integer>>> algorithms = List.of(
                new ShuffleSampleAlg(),
                new ArraySampleAlg(),
                new IterateSampleAlg()
        );

        algorithms.forEach(alg -> sampleSizes.forEach(sampleSize -> sample(set, sampleSize, alg)));
    }

    private static void sample(Set<Integer> set, int sampleSize, SampleAlgorithm<Set<Integer>> alg) {
        sample(
                () -> alg.sample(set, sampleSize),
                alg.getName(),
                sampleSize
        );
    }

    private static void sample(Supplier<Set<Integer>> supp, String name, int sampleSize) {
        long start = System.currentTimeMillis();
        Set<Integer> sampledSet = supp.get();
        long stop = System.currentTimeMillis();
        System.err.println(name + " size: " + sampleSize + " score: " + (stop - start));

        // to make sampledList used to be sure JIT will not erase getting sampledList
        try {
            new PrintWriter(File.createTempFile("aaa", "bbb")).print(sampledSet);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

średnia z 5 wywołań:

wyniki.png

co ciekawe, gdy sample jest bliski całemu setowi samplowanemu szybsza okazuje się iteracja.
macie coś lepszego?

0

macie coś lepszego?

Tak, użyj JMH ;)

1 użytkowników online, w tym zalogowanych: 0, gości: 1