如何在 Spark RDD (Java) 中按索引获取元素
2022-09-01 13:54:11
我知道 rdd.first() 方法,它会返回 RDD 中的第一个元素。
还有 rdd.take(num) 方法,它会返回前 num 个元素。
但是,难道没有按索引获取元素的可能性吗?
谢谢
我知道 rdd.first() 方法,它会返回 RDD 中的第一个元素。
还有 rdd.take(num) 方法,它会返回前 num 个元素。
但是,难道没有按索引获取元素的可能性吗?
谢谢
这应该可以通过首先为 RDD 编制索引来实现。zipWithIndex 转换提供稳定的索引,按其原始顺序对每个元素进行编号。
鉴于:rdd = (a,b,c)
// Pair every element with its position; the result for rdd = (a,b,c) is shown on the right.
val withIndex = rdd.zipWithIndex // ((a,0),(b,1),(c,2))
这种 (元素, 索引) 的形式还不能直接用于按索引查找元素。首先,我们需要把索引作为键:
// Flip each (element, index) pair so that the index becomes the key.
val indexKey = withIndex.map(_.swap) //((0,a),(1,b),(2,c))
现在,可以使用 PairRDD 中的 lookup 操作按键查找元素:
// Fetch every value stored under key 1, i.e. the element whose index is 1.
val b = indexKey.lookup(1) // Array(b)
如果您希望经常在同一 RDD 上使用 lookup,我建议缓存 indexKey 这个 RDD 以提高性能。
如何使用 Java API 执行此操作是留给读者的练习。
我尝试编写了这个类来按索引获取元素。首先,当您构造 new IndexedFetcher(rdd, itemClass) 时,它会计算 RDD 的每个分区中的元素数。然后,当您调用 indexedFetcher.get(n) 时,它仅在包含该索引的分区上运行作业。
请注意,我需要使用Java 1.7而不是1.8编译它;从 Spark 1.1.0 开始,com.esotericsoftware.reflectasm 中捆绑的 org.objectweb.asm 还不能读取 Java 1.8 类(当您尝试运行 Java 1.8 函数时,会抛出 IllegalStateException)。
import java.io.Serializable;
import org.apache.spark.SparkContext;
import org.apache.spark.TaskContext;
import org.apache.spark.rdd.RDD;
import scala.reflect.ClassTag;
public static class IndexedFetcher<E> implements Serializable {
private static final long serialVersionUID = 1L;
public final RDD<E> rdd;
public Integer[] elementsPerPartitions;
private Class<?> clazz;
public IndexedFetcher(RDD<E> rdd, Class<?> clazz){
this.rdd = rdd;
this.clazz = clazz;
SparkContext context = this.rdd.context();
ClassTag<Integer> intClassTag = scala.reflect.ClassTag$.MODULE$.<Integer>apply(Integer.class);
elementsPerPartitions = (Integer[]) context.<E, Integer>runJob(rdd, IndexedFetcher.<E>countFunction(), intClassTag);
}
public static class IteratorCountFunction<E> extends scala.runtime.AbstractFunction2<TaskContext, scala.collection.Iterator<E>, Integer> implements Serializable {
private static final long serialVersionUID = 1L;
@Override public Integer apply(TaskContext taskContext, scala.collection.Iterator<E> iterator) {
int count = 0;
while (iterator.hasNext()) {
count++;
iterator.next();
}
return count;
}
}
static <E> scala.Function2<TaskContext, scala.collection.Iterator<E>, Integer> countFunction() {
scala.Function2<TaskContext, scala.collection.Iterator<E>, Integer> function = new IteratorCountFunction<E>();
return function;
}
public E get(long index) {
long remaining = index;
long totalCount = 0;
for (int partition = 0; partition < elementsPerPartitions.length; partition++) {
if (remaining < elementsPerPartitions[partition]) {
return getWithinPartition(partition, remaining);
}
remaining -= elementsPerPartitions[partition];
totalCount += elementsPerPartitions[partition];
}
throw new IllegalArgumentException(String.format("Get %d within RDD that has only %d elements", index, totalCount));
}
public static class FetchWithinPartitionFunction<E> extends scala.runtime.AbstractFunction2<TaskContext, scala.collection.Iterator<E>, E> implements Serializable {
private static final long serialVersionUID = 1L;
private final long indexWithinPartition;
public FetchWithinPartitionFunction(long indexWithinPartition) {
this.indexWithinPartition = indexWithinPartition;
}
@Override public E apply(TaskContext taskContext, scala.collection.Iterator<E> iterator) {
int count = 0;
while (iterator.hasNext()) {
E element = iterator.next();
if (count == indexWithinPartition)
return element;
count++;
}
throw new IllegalArgumentException(String.format("Fetch %d within partition that has only %d elements", indexWithinPartition, count));
}
}
public E getWithinPartition(int partition, long indexWithinPartition) {
System.out.format("getWithinPartition(%d, %d)%n", partition, indexWithinPartition);
SparkContext context = rdd.context();
scala.Function2<TaskContext, scala.collection.Iterator<E>, E> function = new FetchWithinPartitionFunction<E>(indexWithinPartition);
scala.collection.Seq<Object> partitions = new scala.collection.mutable.WrappedArray.ofInt(new int[] {partition});
ClassTag<E> classTag = scala.reflect.ClassTag$.MODULE$.<E>apply(this.clazz);
E[] result = (E[]) context.<E, E>runJob(rdd, function, partitions, true, classTag);
return result[0];
}
}