
If word u falls within the window of word w, the pair (u,w) is counted once. The window can be a fixed-size sliding window, or it can be defined by words that appear adjacently, in the same sentence, or in the same paragraph. If the window currently contains the words [w1,w2,w3], the Mapper emits ((w1,w2),1) and ((w1,w3),1) and then slides the window forward by one word. In the Reduce phase, the values arriving under the same key are simply summed.

Consider the English sentence: If you do not learn to think when you are young, you may never learn

In the Map phase, suppose the window is 6 words wide. The window first covers "If you do not learn to", which yields the key/value pairs ((If,you),1), ((If,do),1), ((If,not),1), ((If,learn),1), ((If,to),1); these are emitted. The window then slides forward one word to cover "you do not learn to think", yielding ((you,do),1), ((you,not),1), ((you,learn),1), ((you,to),1), ((you,think),1), which are emitted in turn. This repeats until the end of the document, where the window shrinks from the front instead of sliding; once it has shrunk to 2 words, the final pair is ((never,learn),1).
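
To make the windowing concrete, here is a small standalone sketch (not part of the original job; the class and method names are illustrative) that simulates the pair emission over the example sentence with a window of 6 words. It simply splits on spaces and prints the pairs rather than emitting them through a MapReduce context:

import java.util.LinkedList;
import java.util.Queue;

// Simulates the sliding-window pair emission described above.
public class WindowSimulation {
    public static void main(String[] args) {
        String[] words = ("If you do not learn to think when you are young "
                + "you may never learn").split(" ");
        int windowSize = 6;
        Queue<String> window = new LinkedList<String>();
        for (String word : words) {
            window.add(word);
            if (window.size() >= windowSize) {
                emitPairs(window); // window full: pair the head with the rest
            }
        }
        while (window.size() > 1) {
            emitPairs(window); // end of text: shrink the window from the front
        }
    }

    // Pair the head of the window with every other word, then drop the head.
    private static void emitPairs(Queue<String> window) {
        String head = window.peek();
        boolean first = true;
        for (String w : window) {
            if (first) { first = false; continue; }
            System.out.println("((" + head + "," + w + "), 1)");
        }
        window.remove();
    }
}

The first window prints ((If,you), 1) through ((If,to), 1), and the last shrunken window prints ((never,learn), 1), matching the walkthrough above.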

 

Define a custom RecordReader that reads an entire file as a single record, with the file name as the key and the full file content as the value, as shown in Listing 1-1.

Listing 1-1

package com.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Emits exactly one record per file: key = file name, value = full file content.
public class WholeFileInputRecord extends RecordReader<Text, Text> {

    private boolean processed = false; // whether the single record has been emitted
    private Text key = new Text();
    private Text value = new Text();
    private Configuration conf;
    private FileSplit fileSplit;
    private FSDataInputStream fis;

    public WholeFileInputRecord(FileSplit fileSplit, Configuration conf) {
        this.fileSplit = fileSplit;
        this.conf = conf;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        fis = fs.open(file);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            // Read the whole split (the whole file, since it is not splittable) at once.
            byte[] content = new byte[(int) fileSplit.getLength()];
            key.set(fileSplit.getPath().getName());
            try {
                IOUtils.readFully(fis, content, 0, content.length);
                value.set(content);
            } finally {
                IOUtils.closeStream(fis);
            }
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // The stream is already closed in nextKeyValue().
    }
}

 

Define a custom FileInputFormat that marks files as non-splittable, so each document goes to a single Mapper and the sliding window is never cut off at a split boundary, as shown in Listing 1-2.

Listing 1-2

package com.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileInputFormat extends FileInputFormat<Text, Text> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false; // keep each file in one split so the window never crosses a boundary
    }

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new WholeFileInputRecord((FileSplit) split, context.getConfiguration());
    }
}

 

Define a WordPair class to hold two words that appear in the same window, as shown in Listing 1-3. The pair is unordered: equals, hashCode, and compareTo all treat (a,b) and (b,a) as the same key.

Listing 1-3

package com.hadoop.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// An unordered pair of words: (a,b) and (b,a) hash, compare, and test equal the same.
public class WordPair implements WritableComparable<WordPair> {

    private String wordA;
    private String wordB;

    public WordPair() {
    }

    public WordPair(String wordA, String wordB) {
        this.wordA = wordA;
        this.wordB = wordB;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(wordA);
        out.writeUTF(wordB);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        wordA = in.readUTF();
        wordB = in.readUTF();
    }

    @Override
    public String toString() {
        return wordA + "," + wordB;
    }

    @Override
    public int hashCode() {
        // Symmetric: a pair must hash the same regardless of word order.
        return (wordA.hashCode() + wordB.hashCode()) * 9;
    }

    @Override
    public int compareTo(WordPair o) {
        if (equals(o)) {
            return 0;
        }
        return (wordA + wordB).compareTo(o.wordA + o.wordB);
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof WordPair)) { // not a WordPair, so not equal
            return false;
        }
        WordPair w = (WordPair) obj;
        if (wordA.equals(w.wordA) && wordB.equals(w.wordB)) {
            return true;
        }
        if (wordA.equals(w.wordB) && wordB.equals(w.wordA)) {
            return true;
        }
        return false;
    }
}
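
As a quick sanity check (illustrative code, not from the original post), the following snippet demonstrates the unordered-pair semantics the job relies on: because equals, hashCode, and compareTo are all symmetric in the two words, ((w1,w2),1) and ((w2,w1),1) are accumulated under the same reduce key.

package com.hadoop.mapreduce;

// Illustrative check of WordPair's unordered-pair semantics.
public class WordPairCheck {
    public static void main(String[] args) {
        WordPair ab = new WordPair("you", "learn");
        WordPair ba = new WordPair("learn", "you");
        System.out.println(ab.equals(ba));                  // true: word order does not matter
        System.out.println(ab.hashCode() == ba.hashCode()); // true: the sum of hash codes is commutative
        System.out.println(ab.compareTo(ba));               // 0: equal pairs group together when sorted
    }
}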

 

Define the Mapper, as shown in Listing 1-4. It pushes each matched word onto a queue; once the queue reaches the window size, it pairs the head word with every other word in the window, emits each pair with a count of 1, and drops the head to slide the window forward.

Listing 1-4

package com.hadoop.mapreduce;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordConcurrnceMap extends Mapper<Text, Text, WordPair, IntWritable> {

    private int windowSize = 0;
    // Matches simple English words made up of letters only.
    private static final String WORD_REGEX = "([a-zA-Z]+)";
    private static final Pattern WORD_PATTERN = Pattern.compile(WORD_REGEX);
    private Queue<String> queue = new LinkedList<String>();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        windowSize = conf.getInt("windowSize", 20); // window size; defaults to 20 if not set
    }

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        Matcher matcher = WORD_PATTERN.matcher(value.toString());
        while (matcher.find()) {
            queue.add(matcher.group());
            if (queue.size() >= windowSize) {
                wordPair(context); // window full: emit pairs for the head word
            }
        }
        // End of document: shrink the window until only one word remains.
        while (queue.size() > 1) {
            wordPair(context);
        }
    }

    // Pair the word at the head of the window with every other word in the
    // window, emit ((head, other), 1), then slide the window forward one word.
    private void wordPair(Context context) throws IOException, InterruptedException {
        Iterator<String> it = queue.iterator();
        String wordA = it.next();
        while (it.hasNext()) {
            context.write(new WordPair(wordA, it.next()), ONE);
        }
        queue.remove();
    }
}

 

The Reducer simply sums the counts received for each word pair, as shown in Listing 1-5.

Listing 1-5

package com.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class WordConcurrnceReduce extends Reducer<WordPair, IntWritable, WordPair, IntWritable> {

    private IntWritable wordSum = new IntWritable();

    @Override
    protected void reduce(WordPair key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        wordSum.set(sum);
        context.write(key, wordSum);
    }
}

 

The driver class configures and submits the job, taking the input path, output path, and window size as arguments, as shown in Listing 1-6.

Listing 1-6

package com.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordConcurrnce {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length != 3) {
            throw new RuntimeException("Usage: <input path> <output path> <window size>");
        }
        Configuration conf = new Configuration();
        conf.setInt("windowSize", Integer.parseInt(args[2])); // pass the window size to the Mapper
        Job job = Job.getInstance(conf);
        job.setJobName("WordConcurrnce");
        job.setJarByClass(WordConcurrnce.class);
        job.setMapperClass(WordConcurrnceMap.class);
        job.setMapOutputKeyClass(WordPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(WordConcurrnceReduce.class);
        job.setOutputKeyClass(WordPair.class);
        job.setOutputValueClass(IntWritable.class);
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

 

Put the text file into the /data directory on HDFS, as shown in Listing 1-7.

Listing 1-7

root@lejian:/data# cat text 
If you do not learn to think when you are young, you may never learn
root@lejian:/data# hadoop fs -put text /data
root@lejian:/data# hadoop fs -ls -R /data
-rw-r--r--   1 root supergroup         69 2017-01-12 20:41 /data/text

 

Run the job from Listing 1-6; the result is shown in Listing 1-8.

Listing 1-8

root@lejian:/data# hadoop jar wordConcurrnce.jar com.hadoop.mapreduce.WordConcurrnce /data /output 10
…………
root@lejian:/data# hadoop fs -ls -R /output
-rw-r--r--   1 root supergroup          0 2017-01-12 20:55 /output/_SUCCESS
-rw-r--r--   1 root supergroup        848 2017-01-12 20:55 /output/part-r-00000
root@lejian:/data# hadoop fs -cat /output/part-r-00000
If,are  1
If,do   1
If,learn        1
…………
you,when        1
you,you 2
you,young       2

 

Reposted from: https://www.cnblogs.com/baoliyan/p/6279385.html