1 Star 1 Fork 2

sunnyandgood / BigData

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
倒排索引.md 3.76 KB
一键复制 编辑 原始数据 按行查看 历史
sunnyandgood 提交于 2018-12-19 13:08 . Update 倒排索引.md

倒排索引

一、问题分析

二、实现代码

InversIndex类

package com.mr.ii.action;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that builds an inverted index: for every tab-separated word in
 * the input files it reports, per file, how many times the word occurs.
 *
 * <p>Data flow (keys and values are all {@link Text}):
 * <ul>
 *   <li>map: {@code (offset, line)} to {@code (word, "file->1")} for each word on the line</li>
 *   <li>combine: {@code (word, ["file->n", ...])} to {@code (word, "file->sum")} per file</li>
 *   <li>reduce: {@code (word, ["file->n", ...])} to {@code (word, "file->sum\tfile->sum\t...")}</li>
 * </ul>
 *
 * <p>The combiner reads and writes the same record format. Hadoop treats a
 * combiner as an optional optimization that may be applied zero, one or several
 * times to a map output, so the result must not depend on how often it runs.
 */
public class InversIndex {

  /** Separates a file name from its occurrence count inside a posting, e.g. {@code a.txt->2}. */
  private static final String POSTING_SEPARATOR = "->";

  /**
   * Configures and runs the job, then exits with status 0 on success and 1 on failure.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the output path
   * @throws Exception if the job cannot be configured or submitted
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: InversIndex <input path> <output path>");
      System.exit(2);
    }

    Job job = Job.getInstance(new Configuration());
    job.setJarByClass(InversIndex.class);

    job.setMapperClass(InversMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));

    job.setCombinerClass(InversConbiner.class);

    job.setReducerClass(InversReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Propagate the job outcome so that scripts calling "hadoop jar" can detect failures.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }

  /**
   * Adds up the counts of a group of {@code file->count} postings, per file.
   *
   * <p>The posting is split on the LAST separator, so a file name that itself
   * contains {@code ->} is still parsed correctly.
   *
   * @param postings postings that all belong to one word
   * @return file name to total count, in order of first appearance
   * @throws IllegalArgumentException if a posting has no separator
   * @throws NumberFormatException if the count part of a posting is not a number
   */
  private static Map<String, Long> sumPostings(Iterable<Text> postings) {
    Map<String, Long> totals = new LinkedHashMap<String, Long>();
    for (Text posting : postings) {
      // Copy out of the Text immediately: the value handed out by the iterator is read only here.
      String entry = posting.toString();
      int cut = entry.lastIndexOf(POSTING_SEPARATOR);
      if (cut < 0) {
        throw new IllegalArgumentException("Malformed posting (expected file->count): " + entry);
      }
      String fileName = entry.substring(0, cut);
      long count = Long.parseLong(entry.substring(cut + POSTING_SEPARATOR.length()));

      Long previous = totals.get(fileName);
      totals.put(fileName, previous == null ? count : previous + count);
    }
    return totals;
  }

  /** Emits {@code (word, "file->1")} for every non-empty tab-separated word of a line. */
  public static class InversMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text k2 = new Text();
    private final Text v2 = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // The file name and therefore the emitted value are the same for every word of the line.
      FileSplit split = (FileSplit) context.getInputSplit();
      v2.set(split.getPath().getName() + POSTING_SEPARATOR + "1");

      for (String word : value.toString().split("\t")) {
        if (word.isEmpty()) {
          // Blank lines and consecutive tabs would otherwise index an empty word.
          continue;
        }
        k2.set(word);
        context.write(k2, v2);
      }
    }
  }

  /**
   * Pre-aggregates postings on the map side: {@code (word, ["file->n", ...])} becomes one
   * {@code (word, "file->sum")} record per file. Input and output share one format, so the
   * combiner can be skipped or applied repeatedly without changing the final result.
   */
  public static class InversConbiner extends Reducer<Text, Text, Text, Text> {

    private final Text v22 = new Text();

    @Override
    protected void reduce(Text k2, Iterable<Text> v2, Context context)
        throws IOException, InterruptedException {
      for (Map.Entry<String, Long> total : sumPostings(v2).entrySet()) {
        v22.set(total.getKey() + POSTING_SEPARATOR + total.getValue());
        context.write(k2, v22);
      }
    }
  }

  /**
   * Produces the final index line for a word: every {@code file->count} posting followed by a
   * tab (including the last one, which keeps the established output format).
   */
  public static class InversReducer extends Reducer<Text, Text, Text, Text> {

    private final Text v3 = new Text();

    @Override
    protected void reduce(Text k2, Iterable<Text> v2, Context context)
        throws IOException, InterruptedException {
      StringBuilder line = new StringBuilder();
      for (Map.Entry<String, Long> total : sumPostings(v2).entrySet()) {
        line.append(total.getKey())
            .append(POSTING_SEPARATOR)
            .append(total.getValue())
            .append('\t');
      }

      v3.set(line.toString());
      context.write(k2, v3);
    }
  }
}

三、数据

a.txt

  hello	tom
  hello	kitty
  hello	jerry
  hello	tom

b.txt

  hello	cat
  hello	tom
  hello	kitty

c.txt

  hello	tom
  cat	kitty

四、执行步骤

  hadoop jar /ii.jar com.mr.ii.action.InversIndex /data /outdata

五、执行结果

  [root@hadoop01 tmp]# hdfs dfs -cat /outdata/part-r-00000
  cat	c.txt->1	b.txt->1	
  hello	b.txt->3	c.txt->1	a.txt->4	
  jerry	a.txt->1	
  kitty	a.txt->1	b.txt->1	c.txt->1	
  tom	c.txt->1	b.txt->1	a.txt->2	
Java
1
https://gitee.com/sunnyandgood/BigData.git
git@gitee.com:sunnyandgood/BigData.git
sunnyandgood
BigData
BigData
master

搜索帮助