Commit b570a41b authored by YUSHIQIAN

tw-join

parent e3542629
-- Two-hop path computation over the graph stored in '$input'.
-- Each input row is read as a pair of longs (id, fr); the comments in this
-- script treat a row as "one user follows another".
--
-- Usage:
--   Parameters are passed with the -p flag and the script is selected with -f:
--     pig -f tw-join.pig -p input=/data/TWITTER/twitter_graph2.txt
--   For local testing with the default input, run it without -p or -f:
--     pig -x local tw-join.pig

-- Default input location, used when no "-p input=..." is given on the command line.
%default input './local-input/OSN/tw.txt'

-- Default number of reducers for operators that do not specify PARALLEL themselves.
SET default_parallel 20;

-- Load the same data twice under two different aliases so that the relation
-- can later be joined with itself without field-name conflicts.
datasetA = LOAD '$input' AS (id: long, fr: long);
datasetB = LOAD '$input' AS (id: long, fr: long);

-- Keep only complete rows: a null in either field (e.g. a missing value or one
-- that could not be read as a long) would make the row useless for the join.
-- NOTE: nothing is stored at this point; the single STORE happens at the very
-- end of the script, once `result` has been defined.
good_datasetA = FILTER datasetA BY id IS NOT NULL AND fr IS NOT NULL;
good_datasetB = FILTER datasetB BY id IS NOT NULL AND fr IS NOT NULL;
-- Compute all the two-hop paths: pair every row (a, b) of the first copy with
-- every row (b, c) of the second copy by matching the first copy's `fr` field
-- against the second copy's `id` field.
twohop = JOIN good_datasetA BY fr, good_datasetB BY id;

-- Project the joined relation so that only the start and end nodes of each
-- two-hop path remain. The `alias::field` form is needed because both sides of
-- the join carry fields with the same names.
p_result = FOREACH twohop GENERATE good_datasetA::id AS src, good_datasetB::fr AS dst;

-- Avoid loops (e.g., if user 12 and 13 follow each other, the path 12 -> 13 -> 12
-- starts and ends on the same node). Filtering before DISTINCT keeps the final
-- output identical while giving DISTINCT fewer tuples to de-duplicate.
no_loops = FILTER p_result BY src != dst;

-- Remove duplicates: the same (start, end) pair can be reached through several
-- different intermediate nodes.
result = DISTINCT no_loops;

STORE result INTO '/output/OSN/twj/';
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment