hadoop版本:hadoop-2.6.0-cdh5.10.0
spark版本:2.4.5
jdk版本:1.8
sequoiadb版本:5.0.1(com.sequoiadb.sequoiadb-driver-5.0.1.jar)
sequoiadb-hadoop-connector版本:2.2(com.sequoiadb.hadoop-connector-2.2.jar)
【问题详细描述】
使用spark和hadoop-connector读取sdb的json实例失败,报错:
com.sequoiadb.exception.BaseException: SDB_NETWORK(-15): Network error, detail: failed to connect to vip-070:11820
at com.sequoiadb.net.TCPConnection.connect(TCPConnection.java:127)
at com.sequoiadb.base.Sequoiadb.init(Sequoiadb.java:482)
at com.sequoiadb.base.Sequoiadb.<init>(Sequoiadb.java:458)
at com.sequoiadb.base.Sequoiadb.<init>(Sequoiadb.java:467)
at com.sequoiadb.base.Sequoiadb.<init>(Sequoiadb.java:445)
at com.sequoiadb.hadoop.io.SequoiadbBlockReader.<init>(SequoiadbBlockReader.java:76)
at com.sequoiadb.hadoop.mapreduce.SequoiadbInputFormat.createRecordReader(SequoiadbInputFormat.java:60)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.liftedTree1$1(NewHadoopRDD.scala:197)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:196)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:151)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: vip-070
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:184)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:607)
at com.sequoiadb.net.TCPConnection.connect(TCPConnection.java:119)
... 26 more
代码:
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.cqvip.analyzer.tools.hdfs.DeleteHDFS;
import com.sequoiadb.hadoop.io.BSONWritable;
import com.sequoiadb.hadoop.mapreduce.SequoiadbInputFormat;
/**
 * Reads a SequoiaDB collection through the Hadoop connector and writes each
 * record to HDFS as a "key&lt;TAB&gt;bson" text line.
 *
 * NOTE(review): the attached UnknownHostException for "vip-070" occurs because
 * the connector contacts the coordinator at {@code SDB_URI} and is then
 * redirected to data-node addresses as registered in the SequoiaDB catalog
 * (host names like "vip-070"). Presumably every Spark executor host must be
 * able to resolve those names (e.g. via /etc/hosts or DNS) — verify cluster
 * name resolution; this code cannot fix that by itself.
 */
public class NormalExtract
{
    private static final Logger LOGGER = LoggerFactory.getLogger(NormalExtract.class);

    // Coordinator address of the SequoiaDB cluster (host:port).
    private static final String SDB_URI = "192.168.31.26:11810";
    // Source collection space and collection to read.
    private static final String COLLECTION_SPACE = "datawarehouse_test";
    private static final String COLLECTION_NAME = "base_obj_meta_a";
    // HDFS output directory; removed and rewritten on each run.
    private static final String SHOW_PATH = "/user/ganruoxun/test";

    public static void main(String[] args)
    {
        SparkSession sparkSession = SparkSession
            .builder()
            .appName("NormalExtract")
            .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
        jsc.setLogLevel("warn");
        try
        {
            extract(jsc);
        }
        finally
        {
            // Always release the Spark context, even when extraction fails.
            jsc.close();
        }
    }

    /**
     * Configures the SequoiaDB input format on the shared Hadoop configuration,
     * builds the input RDD, and saves it as text to {@code SHOW_PATH}.
     *
     * @param jsc the active Spark context; not closed by this method
     */
    public static void extract(JavaSparkContext jsc)
    {
        Configuration conf = jsc.hadoopConfiguration();
        conf.set("sequoiadb.input.url", SDB_URI);
        conf.set("sequoiadb.in.collectionspace", COLLECTION_SPACE);
        conf.set("sequoiadb.in.collection", COLLECTION_NAME);

        JavaPairRDD<Object, BSONWritable> sdbRDD = jsc
            .newAPIHadoopRDD(conf, SequoiadbInputFormat.class, Object.class, BSONWritable.class);

        if (DeleteHDFS.deleteDir(jsc, SHOW_PATH))
        {
            sdbRDD.map(tuple -> tuple._1.toString() + "\t" + tuple._2.getBson().toString())
                .saveAsTextFile(SHOW_PATH);
        }
        else
        {
            // Previously the job silently produced no output when the old
            // directory could not be removed; surface that condition instead.
            LOGGER.warn("Failed to delete output path {}, skipping result write.", SHOW_PATH);
        }
    }
}