Answers Pig

  • Filter the speedtest conducted in Barcelona or Madrid. Then list the internet providers working in those cities.
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
                  client_address: chararray,
                  client_country: chararray,
                  lon: float,
                  lat: float,
                  client_provider: chararray,
                  mlabservername:  chararray,
                  connect_time:    float,
                  download_speed:  float,
                  neubot_version:  float,
                  platform:        chararray,
                  remote_address:  chararray,
                  test_name:       chararray,
                  timestamp:       long,
                  upload_speed:    float,
                  latency:  float,
                  uuid:     chararray,
                  asnum:    chararray,
                  region:   chararray,
                  city:     chararray,
                  hour:     int,
                  month:    int,
                  year:     int,
                  weekday:  int,
                  day:      int,
                  filedate: chararray
);

--
-- A1: Internet Providers in 'Barcelona' or 'Madrid' where speedtests were conducted
--

SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');

SpeedTests = FILTER @ BY (
    city matches '.*Barcelona.*' OR
    city matches '.*Madrid.*'
);

Providers = FOREACH @ GENERATE city, client_provider;
Providers = DISTINCT @;

DUMP @;
  • List the names and the IP ranges of the internet providers located in Barcelona. For this you need to use the IPtoNumber user defined function (cf. NeubotTestsUDFs.jar).
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
                  client_address: chararray,
                  client_country: chararray,
                  lon: float,
                  lat: float,
                  client_provider: chararray,
                  mlabservername:  chararray,
                  connect_time:    float,
                  download_speed:  float,
                  neubot_version:  float,
                  platform:        chararray,
                  remote_address:  chararray,
                  test_name:       chararray,
                  timestamp:       long,
                  upload_speed:    float,
                  latency:  float,
                  uuid:     chararray,
                  asnum:    chararray,
                  region:   chararray,
                  city:     chararray,
                  hour:     int,
                  month:    int,
                  year:     int,
                  weekday:  int,
                  day:      int,
                  filedate: chararray
);

--
-- A2: Internet Providers in Barcelona and their IP range based on the speedtests observations
--

SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');

SpeedTests = FILTER @ BY (
    city matches '.*Barcelona.*'
);

Providers = FOREACH @ GENERATE
    city,
    client_provider,
    IPtoNumber(client_address) AS ip
;

Providers = GROUP @ BY client_provider;

Providers_IP_Range = FOREACH @ GENERATE
    group,
    NumberToIP( MIN(Providers.ip) ),
    NumberToIP( MAX(Providers.ip) )
;

DUMP @;
  • Group the speedtest based on the user network infrastructure (e.g., 3G/4G vs ADSL). For this you can assume some max bandwidth (e.g., 21Mb/sec for ADSL).
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
                  client_address: chararray,
                  client_country: chararray,
                  lon: float,
                  lat: float,
                  client_provider: chararray,
                  mlabservername:  chararray,
                  connect_time:    float,
                  download_speed:  float,
                  neubot_version:  float,
                  platform:        chararray,
                  remote_address:  chararray,
                  test_name:       chararray,
                  timestamp:       long,
                  upload_speed:    float,
                  latency:  float,
                  uuid:     chararray,
                  asnum:    chararray,
                  region:   chararray,
                  city:     chararray,
                  hour:     int,
                  month:    int,
                  year:     int,
                  weekday:  int,
                  day:      int,
                  filedate: chararray
);

--
-- A3: Speedtests (conducted in Barcelona) organized by network type: Mobile vs ADSL
--

SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');
SpeedTests = FILTER @ BY (
    city matches '.*Barcelona.*'
);

SPLIT @ INTO
    Mobile_Tests IF (
        download_speed > 21000000 -- 21 Mb / sec
    ),

    ADSL_Tests IF (
        download_speed <= 21000000 -- 21 Mb / sec
    )
;

MobileSpeeds = FOREACH Mobile_Tests GENERATE
     CEIL(download_speed / 1000000) AS download_speed,
     'mobile' AS network_type: chararray
;

ADSLSpeeds = FOREACH ADSL_Tests GENERATE
     CEIL(download_speed / 1000000) AS download_speed,
     'adsl' AS network_type: chararray
;

Speeds = UNION MobileSpeeds, ADSLSpeeds;
Speeds = GROUP @ BY (download_speed, network_type);
Speeds = FOREACH @ GENERATE
     CONCAT( (chararray) group.download_speed, ' mb/sec' ),
     group.network_type,
     COUNT(Speeds)
;

DUMP @;
  • Find the user that realized the maximum number of tests. For this user, produce a table showing the evolution of her/his download/upload speeds.
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
                  client_address: chararray,
                  client_country: chararray,
                  lon: float,
                  lat: float,
                  client_provider: chararray,
                  mlabservername:  chararray,
                  connect_time:    float,
                  download_speed:  float,
                  neubot_version:  float,
                  platform:        chararray,
                  remote_address:  chararray,
                  test_name:       chararray,
                  timestamp:       long,
                  upload_speed:    float,
                  latency:  float,
                  uuid:     chararray,
                  asnum:    chararray,
                  region:   chararray,
                  city:     chararray,
                  hour:     int,
                  month:    int,
                  year:     int,
                  weekday:  int,
                  day:      int,
                  filedate: chararray
);

--
-- Determines the user that realized the maximum number of tests and 
-- obtain his/her download_speed log
--

Tests = FILTER @ BY (test_name matches '.*speedtest.*');

Tests_In_Barcelona = FILTER @ BY (
    city matches '.*Barcelona.*'
);

Tests_Per_User = GROUP Tests_In_Barcelona BY uuid;

Tests_Per_User = FOREACH @ GENERATE
    group AS uuid,
    COUNT(Tests_In_Barcelona) AS numberOfTests
;

MAX_NUM_TESTS = GROUP @ ALL;
MAX_NUM_TESTS = FOREACH @ GENERATE
    MAX( Tests_Per_User.numberOfTests) AS numberOfTests
;

TOP_1_USER = JOIN
    Tests_Per_User BY numberOfTests,
    MAX_NUM_TESTS  BY numberOfTests
;

TOP_1_USER = FOREACH @ GENERATE Tests_Per_User::uuid AS uuid;

TOP_1_USER_TESTS = JOIN
    Tests BY uuid,
    TOP_1_USER BY uuid
;

TOP_1_USER_TESTS = FOREACH @ GENERATE
    Tests::uuid AS uuid,
    Tests::city AS city,
    Tests::timestamp AS timestamp,
    Tests::download_speed AS download_speed
;

TOP_1_USER_TESTS = ORDER @ BY timestamp;

STORE @ INTO 'Top_1_User' USING PigStorage(',');