I came across this article of interview questions, "Spark 和hadoop的一些面試題(准備)" (a set of Spark and Hadoop interview questions, for preparation):
http://blog.csdn.net/qiezikuaichuan/article/details/51578743
One of the questions in it is quite good; see:
http://www.aboutyun.com/thread-18826-1-1.html
http://www.cnblogs.com/lucius/p/3483494.html
I think it is worth actually coding it up on Hadoop.
I also think the following passage from the first article is a good summary:
Briefly describe the data mining algorithms you know and the scenarios they are used in:
(I) Cases based on classification models
(1) Spam filtering, usually done with Naive Bayes
(2) Tumor diagnosis in medicine, via a classification model
(II) Cases based on prediction models
(1) Judging red wine quality with a classification and regression tree (CART) model
(2) Search engine query volume and stock price movements
(III) Cases based on association analysis: Walmart's beer and diapers
(IV) Cases based on cluster analysis: retail customer segmentation
(V) Cases based on outlier analysis: transaction fraud detection in payments
(VI) Cases based on collaborative filtering: e-commerce "you may also like" and recommendation engines
(VII) Cases based on social network analysis: seed customers in telecom
(VIII) Cases based on text analysis
(1) Character recognition: scanner apps such as 掃描王
(2) Literature and statistics: attributing the authorship of Dream of the Red Chamber (紅樓夢)
Back to the common-friends question mentioned above: I wrote a program and tried it out.
It lives in the IntelliJ project HadoopProj, a Maven project with the following dependencies:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hadoop.my</groupId>
    <artifactId>hadoop-proj</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.3</version>
        </dependency>
    </dependencies>

    <repositories>
        <repository>
            <id>aliyunmaven</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
    </repositories>
</project>
The code:
package com.hadoop.my;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Created by baidu on 16/12/3.
 */
public class HadoopProj {

    public static class CommonFriendsMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line looks like "A:B,C,D,F,E,O"
            String line = value.toString();
            String[] split = line.split(":");
            String person = split[0];
            String[] friends = split[1].split(",");
            // Invert the relation: emit (friend, person) for every friend of this person
            for (String f : friends) {
                context.write(new Text(f), new Text(person));
            }
        }
    }

    public static class CommonFriendsReducer extends Reducer<Text, Text, Text, Text> {
        // Input:  <B->A> <B->E> <B->F> ...
        // Output: B	A,E,F,J
        @Override
        protected void reduce(Text friend, Iterable<Text> persons, Context context)
                throws IOException, InterruptedException {
            StringBuffer sb = new StringBuffer();
            for (Text person : persons) {
                sb.append(person + ",");
            }
            context.write(friend, new Text(sb.toString()));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Read and parse all xxx-site.xml configuration files on the classpath
        Configuration conf = new Configuration();
        Job friendJob = Job.getInstance(conf);

        // Locate the jar containing this job's code via the main class's class loader
        friendJob.setJarByClass(HadoopProj.class);

        // Mapper and reducer classes for this job
        friendJob.setMapperClass(CommonFriendsMapper.class);
        friendJob.setReducerClass(CommonFriendsReducer.class);

        // Output key/value types of the reducer
        friendJob.setOutputKeyClass(Text.class);
        friendJob.setOutputValueClass(Text.class);

        // Input path to process
        FileInputFormat.setInputPaths(friendJob, new Path(args[0]));
        // Output path for the results
        FileOutputFormat.setOutputPath(friendJob, new Path(args[1]));

        // Submit the job to the Hadoop cluster and wait for completion
        boolean res = friendJob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
After packaging it into a jar, I copied it to the Hadoop machine m42n05.
On that machine I also created the input file, with the following content:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
Commands:
$ hadoop fs -mkdir /input/frienddata
$ hadoop fs -put text.txt /input/frienddata
$ hadoop fs -ls /input/frienddata
Found 1 items
-rw-r--r--   3 work supergroup        142 2016-12-03 17:12 /input/frienddata/text.txt
Then I copied hadoop-proj.jar to /home/work/data/installed/hadoop-2.7.3/myjars on m42n05 and ran the command:
$ hadoop jar /home/work/data/installed/hadoop-2.7.3/myjars/hadoop-proj.jar com.hadoop.my.HadoopProj /input/frienddata /output/frienddata
It failed with an error:
$ hadoop jar /home/work/data/installed/hadoop-2.7.3/myjars/hadoop-proj.jar com.hadoop.my.HadoopProj /input/frienddata /output/frienddata
16/12/03 17:19:52 INFO client.RMProxy: Connecting to ResourceManager at master.Hadoop/10.117.146.12:8032
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://master.Hadoop:8390/input/frienddata already exists
It looks like the indexing of the command-line arguments is off. Note how the code reads them:
// Input path to process
FileInputFormat.setInputPaths(friendJob, new Path(args[0]));
// Output path for the results
FileOutputFormat.setOutputPath(friendJob, new Path(args[1]));
In Java, unlike C/C++, args[0] is already the first real argument; the program name does not occupy a slot. The jar's manifest most likely already declares the main class, so the class name I typed on the command line was passed straight through to the program as args[0], shifting both paths over by one: args[1] became /input/frienddata, which the job then treated as its output directory, hence the "already exists" error. So the class name should not be supplied at all. Re-running without it:
$ hadoop jar /home/work/data/installed/hadoop-2.7.3/myjars/hadoop-proj.jar /input/frienddata /output/frienddata
This produced the following output:
16/12/03 17:24:33 INFO client.RMProxy: Connecting to ResourceManager at master.Hadoop/10.117.146.12:8032
16/12/03 17:24:33 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
16/12/03 17:24:34 INFO input.FileInputFormat: Total input paths to process : 1
16/12/03 17:24:34 INFO mapreduce.JobSubmitter: number of splits:1
16/12/03 17:24:34 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1478254572601_0002
16/12/03 17:24:34 INFO impl.YarnClientImpl: Submitted application application_1478254572601_0002
16/12/03 17:24:34 INFO mapreduce.Job: The url to track the job: http://master.Hadoop:8320/proxy/application_1478254572601_0002/
16/12/03 17:24:34 INFO mapreduce.Job: Running job: job_1478254572601_0002
16/12/03 17:24:40 INFO mapreduce.Job: Job job_1478254572601_0002 running in uber mode : false
16/12/03 17:24:40 INFO mapreduce.Job:  map 0% reduce 0%
16/12/03 17:24:45 INFO mapreduce.Job:  map 100% reduce 0%
16/12/03 17:24:49 INFO mapreduce.Job:  map 100% reduce 100%
16/12/03 17:24:50 INFO mapreduce.Job: Job job_1478254572601_0002 completed successfully
16/12/03 17:24:50 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=348
        FILE: Number of bytes written=238531
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=258
        HDFS: Number of bytes written=156
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=2651
        Total time spent by all reduces in occupied slots (ms)=2446
        Total time spent by all map tasks (ms)=2651
        Total time spent by all reduce tasks (ms)=2446
        Total vcore-milliseconds taken by all map tasks=2651
        Total vcore-milliseconds taken by all reduce tasks=2446
        Total megabyte-milliseconds taken by all map tasks=2714624
        Total megabyte-milliseconds taken by all reduce tasks=2504704
    Map-Reduce Framework
        Map input records=14
        Map output records=57
        Map output bytes=228
        Map output materialized bytes=348
        Input split bytes=116
        Combine input records=0
        Combine output records=0
        Reduce input groups=14
        Reduce shuffle bytes=348
        Reduce input records=57
        Reduce output records=14
        Spilled Records=114
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=111
        CPU time spent (ms)=1850
        Physical memory (bytes) snapshot=455831552
        Virtual memory (bytes) snapshot=4239388672
        Total committed heap usage (bytes)=342360064
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=142
    File Output Format Counters
        Bytes Written=156
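One line in that output is worth a note: the WARN about "Hadoop command-line option parsing not performed" suggests implementing the Tool interface and launching the job through ToolRunner. A minimal sketch of what such a driver could look like is below; the class name HadoopProjDriver is my own and not part of the project above.

package com.hadoop.my;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver showing the Tool/ToolRunner pattern; not part of the original project.
public class HadoopProjDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() returns the Configuration that ToolRunner has already populated
        // with any generic options (-D key=value, -files, ...) from the command line.
        Job friendJob = Job.getInstance(getConf());
        friendJob.setJarByClass(HadoopProjDriver.class);
        friendJob.setMapperClass(HadoopProj.CommonFriendsMapper.class);
        friendJob.setReducerClass(HadoopProj.CommonFriendsReducer.class);
        friendJob.setOutputKeyClass(Text.class);
        friendJob.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(friendJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(friendJob, new Path(args[1]));
        return friendJob.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic options and passes the remaining args to run()
        int res = ToolRunner.run(new Configuration(), new HadoopProjDriver(), args);
        System.exit(res);
    }
}

With this pattern, generic options such as -D mapreduce.job.reduces=2 would be consumed by ToolRunner before the remaining arguments reach run(), which is what the warning is asking for.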
Check where the output files ended up:
$ hadoop fs -ls /output/frienddata
Found 2 items
-rw-r--r--   3 work supergroup          0 2016-12-03 17:24 /output/frienddata/_SUCCESS
-rw-r--r--   3 work supergroup        156 2016-12-03 17:24 /output/frienddata/part-r-00000
$ hadoop fs -cat /output/frienddata/part-r-00000
A	I,K,C,B,G,F,H,O,D,
B	A,F,J,E,
C	A,E,B,H,F,G,K,
D	G,C,K,A,L,F,E,H,
E	G,M,L,H,A,F,B,D,
F	L,M,D,C,G,A,
G	M,
H	O,
I	O,C,
J	O,
K	B,
L	D,E,
M	E,F,
O	A,H,I,J,F,
Of course, the output can also be merged down into a local file:
$ hdfs dfs -getmerge hdfs://master.Hadoop:8390/output/frienddata /home/work/frienddatatmp
$ cat frienddatatmp
A	I,K,C,B,G,F,H,O,D,
B	A,F,J,E,
C	A,E,B,H,F,G,K,
D	G,C,K,A,L,F,E,H,
E	G,M,L,H,A,F,B,D,
F	L,M,D,C,G,A,
G	M,
H	O,
I	O,C,
J	O,
K	B,
L	D,E,
M	E,F,
O	A,H,I,J,F,
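The output of this job is the inverted mapping: each person followed by everyone who lists them as a friend. If one also wants the explicit list of common friends for every pair of users, a second MapReduce job can take this output as its input, emit one record per pair of people appearing in the same value list, and let the reducer collect the shared friends. A rough sketch of such a job is below, assuming the first job's output directory (e.g. /output/frienddata) is passed as args[0]; the class name CommonFriendsStepTwo is my own and not part of the project above.

package com.hadoop.my;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Arrays;

// Hypothetical second-stage job; not part of the original project.
public class CommonFriendsStepTwo {

    public static class PairMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is one output line of the first job, e.g. "A\tI,K,C,B,G,F,H,O,D,"
            String[] split = value.toString().split("\t");
            String friend = split[0];
            String[] persons = split[1].split(",");
            // Sort so that each pair key is always written in the same order (A-B, never B-A)
            Arrays.sort(persons);
            for (int i = 0; i < persons.length; i++) {
                for (int j = i + 1; j < persons.length; j++) {
                    context.write(new Text(persons[i] + "-" + persons[j]), new Text(friend));
                }
            }
        }
    }

    public static class PairReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text pair, Iterable<Text> friends, Context context)
                throws IOException, InterruptedException {
            // Collect every friend shared by this pair of people
            StringBuilder sb = new StringBuilder();
            for (Text f : friends) {
                sb.append(f).append(" ");
            }
            context.write(pair, new Text(sb.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(CommonFriendsStepTwo.class);
        job.setMapperClass(PairMapper.class);
        job.setReducerClass(PairReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Input is the output directory of the first job, e.g. /output/frienddata
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Run over the data above, this should yield lines such as "A-B	C E", since C and E are the friends that A and B have in common (the exact ordering may differ).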
That wraps up this problem.