Author Archives: admin

Script

##################################################
## Developer Tools
##################################################

# Install the base tooling in one non-interactive apt call
# (one package per line so additions/removals diff cleanly).
sudo apt-get install -y \
    ssh \
    build-essential \
    git \
    openjdk-7-jdk

##################################################
## Lightning-Server
##################################################
#-------------------------------------
#-- Docker-Engine: install
#-------------------------------------
sudo apt-get update
# -y keeps the script non-interactive, like every other install step here.
sudo apt-get install -y apt-transport-https ca-certificates

# Import the Docker repository signing key and register the repository.
# The options must be plain ASCII double hyphens and the quotes plain ASCII.
sudo apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D
echo "deb https://apt.dockerproject.org/repo ubuntu-trusty main" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

sudo apt-get update
sudo apt-get install -y docker-engine
sudo apt-get -y autoremove

#-------------------------------------
#-- Docker-Engine: config & test
#-------------------------------------
sudo usermod -aG docker "$(whoami)"
sudo docker run hello-world

#-------------------------------------
#-- Lightning: install
#-------------------------------------
# Runs the container attached to the terminal (-i -t); the script continues
# only after the container exits.
sudo docker run -i -t -p 3000:3000 lightningviz/lightning:latest

#-------------------------------------
#-- Lightning: config run-script
#-------------------------------------
# Write the launcher in one shot; plain `tee` (no -a) overwrites, so re-running
# this section does not duplicate lines. The quoted 'EOF' keeps the text literal.
sudo tee /usr/local/bin/lightning-server > /dev/null <<'EOF'
#!/bin/bash
docker run -i -t -p 3000:3000 lightningviz/lightning:latest
EOF

sudo chmod +x /usr/local/bin/lightning-server

#-------------------------------------
#-- Reboot (needed for running docker without sudo)
#-------------------------------------
sudo reboot

##################################################
## Rapidminer (manual install)
##################################################

# Download from https://rapidminer.com

#-------------------------------------
#-- Install (assumes rapidminer-studio is in current directory)
#-------------------------------------
sudo mv rapidminer-studio /usr/local/rapidminer-studio

#-------------------------------------
#-- Config
#-------------------------------------
# Create the launcher script. Plain `tee` overwrites the file so the section is
# re-runnable, and the quoted 'EOF' writes the lines verbatim (no typographic
# quotes ending up inside the script).
sudo tee /usr/local/bin/rapidminer-studio > /dev/null <<'EOF'
#!/bin/bash
export RAPIDMINER_HOME=/usr/local/rapidminer-studio/
/usr/local/rapidminer-studio/RapidMiner-Studio.sh
EOF

sudo chmod +x /usr/local/bin/rapidminer-studio
##################################################
## NoSQL Databases
##################################################
#-------------------------------------
#-- MongoDB
#-------------------------------------

# Import the MongoDB repository key (options need ASCII double hyphens),
# register the 3.2 repository for trusty, then install.
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv EA312927
echo "deb http://repo.mongodb.org/apt/ubuntu trusty/mongodb-org/3.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.2.list
sudo apt-get update
sudo apt-get install -y mongodb-org

#-------------------------------------
#-- CouchDB
#-------------------------------------

sudo apt-get update
sudo apt-get install software-properties-common -y
sudo add-apt-repository ppa:couchdb/stable -y
sudo apt-get update
sudo apt-get install couchdb -y
# Smoke test: query the local CouchDB HTTP endpoint.
curl localhost:5984

# CouchDB Client 4 Python
pip install couchdb

##################################################
## Pig Latin
##################################################

# Download from http://pig.apache.org/releases.html

#-------------------------------------
#-- Install (assumes tar is in current directory)
#-------------------------------------

# Unpack the release archive, then drop the archive so that the `pig-*` glob
# below matches only the extracted directory.
tar -xvf pig-* && rm pig-*.tar*
# -p: do not fail when /usr/local/apache/ already exists.
sudo mkdir -p /usr/local/apache/
sudo mv pig-* /usr/local/apache/pig

# Append the environment setup to ~/.bashrc. The quoted 'EOF' keeps $PATH and
# $PIG_HOME unexpanded so they are evaluated when .bashrc is sourced, not now.
cat >> ~/.bashrc <<'EOF'


# Apache Pig Latin
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-amd64/
export PIG_HOME=/usr/local/apache/pig
export PATH=$PATH:$PIG_HOME/bin
EOF

source ~/.bashrc

##################################################
## Anaconda
##################################################

# Download from https://www.continuum.io/downloads
# & follow instructions

# Only needed when Anaconda was installed into a different folder:
# open up the whole tree (mode 777, applied recursively).
chmod --recursive 777 /usr/local/anaconda2/

##################################################
## Cesium JS
##################################################

#-------------------------------------
#-- Node
#-------------------------------------

# Run the NodeSource setup script; the trailing argument must be a plain ASCII
# hyphen so that bash reads the script from stdin.
curl -sL https://deb.nodesource.com/setup_4.x | sudo -E bash -
# NOTE(review): the NodeSource nodejs package may already bundle npm; confirm
# that requesting the distro `npm` package as well does not conflict.
sudo apt-get install -y nodejs npm

# -f replaces an existing link so this section can be re-run without errors.
sudo ln -sf /usr/bin/nodejs /usr/local/bin/node
sudo ln -sf /usr/bin/npm /usr/local/bin/npm
#-------------------------------------
#-- Cesium & Express
#-------------------------------------
npm install cesium
npm install express compression request yargs
#-------------------------------------
#-- Google Chrome
#-------------------------------------

# Manual install: https://www.google.com/chrome/browser/desktop/
##################################################
## VM Customization
##################################################

#-------------------------------------
#-- Big Data Analytics (bda) sudo user
#-------------------------------------
# NOTE(review): hard-coded password stored in the script; change it after setup.
PASSWORD=bigdata
# Quote the comment and the password hash so each is passed as one argument.
sudo useradd -c 'BD Analytics' -p "$(openssl passwd -1 "$PASSWORD")" bda
sudo usermod -aG sudo bda
sudo usermod -aG docker bda

# Clone the current user's home directory as the new user's home.
sudo cp -R "/home/$(whoami)" "/home/$(whoami).tmp"
sudo mv "/home/$(whoami).tmp" /home/bda
sudo chown -R bda:bda /home/bda

#-------------------------------------
#-- Enable autologin (bda user)
#-------------------------------------

# -f: do not fail when the file does not exist yet.
sudo rm -f /etc/lightdm/lightdm.conf

# Write the configuration verbatim (quoted 'EOF', no typographic quotes).
sudo tee /etc/lightdm/lightdm.conf > /dev/null <<'EOF'
[SeatDefaults]
autologin-guest=false
autologin-user=bda
autologin-user-timeout=0
autologin-session=lightdm-autologin
EOF

Answers Pig

  • Filter the speedtests conducted in Barcelona or Madrid. Then list the internet providers operating in those cities.
--
-- A1: Internet Providers in 'Barcelona' or 'Madrid' where speedtests were conducted
--

-- Load the ';'-separated Neubot test records with an explicit schema.
NeubotTests = LOAD 'NeubotTests' USING PigStorage(';') AS (
    client_address:chararray, client_country:chararray,
    lon:float, lat:float,
    client_provider:chararray, mlabservername:chararray,
    connect_time:float, download_speed:float, neubot_version:float,
    platform:chararray, remote_address:chararray, test_name:chararray,
    timestamp:long, upload_speed:float, latency:float,
    uuid:chararray, asnum:chararray, region:chararray, city:chararray,
    hour:int, month:int, year:int, weekday:int, day:int,
    filedate:chararray
);

-- Keep only speedtest records whose city mentions Barcelona or Madrid
-- (both conditions combined into a single filter).
CityTests = FILTER NeubotTests BY
    (test_name matches '.*speedtest.*') AND
    (city matches '.*Barcelona.*' OR city matches '.*Madrid.*');

-- Project (city, provider) pairs and remove duplicates.
CityProviders   = FOREACH CityTests GENERATE city, client_provider;
UniqueProviders = DISTINCT CityProviders;

DUMP UniqueProviders;
  • List the names and the IP ranges of the internet providers located in Barcelona. For this you need to use the IPtoNumber user defined function (cf. NeubotTestsUDFs.jar).
--
-- A2: Internet Providers in Barcelona and their IP range based on the speedtests observations
--

-- The IPtoNumber / NumberToIP user defined functions must be registered before use.
-- NOTE(review): assumes NeubotTestsUDFs.jar is in the working directory and that
-- the UDF classes need no package prefix; adjust the path or add DEFINEs otherwise.
REGISTER NeubotTestsUDFs.jar;

NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
                  client_address: chararray,
                  client_country: chararray,
                  lon: float,
                  lat: float,
                  client_provider: chararray,
                  mlabservername:  chararray,
                  connect_time:    float,
                  download_speed:  float,
                  neubot_version:  float,
                  platform:        chararray,
                  remote_address:  chararray,
                  test_name:       chararray,
                  timestamp:       long,
                  upload_speed:    float,
                  latency:  float,
                  uuid:     chararray,
                  asnum:    chararray,
                  region:   chararray,
                  city:     chararray,
                  hour:     int,
                  month:    int,
                  year:     int,
                  weekday:  int,
                  day:      int,
                  filedate: chararray
);

-- Keep only the speedtest records ('@' refers to the previously defined alias).
SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*');

-- Restrict to tests whose city mentions Barcelona.
SpeedTests = FILTER @ BY (
    city matches '.*Barcelona.*'
);

-- Convert each client address to a number so MIN/MAX can be applied to it.
Providers = FOREACH @ GENERATE
    city,
    client_provider,
    IPtoNumber(client_address) AS ip
;

Providers = GROUP @ BY client_provider;

-- Per provider: lowest and highest observed address, converted back to IP form.
Providers_IP_Range = FOREACH @ GENERATE
    group,
    NumberToIP( MIN(Providers.ip) ),
    NumberToIP( MAX(Providers.ip) )
;

DUMP @;
  • Group the speedtests based on the user's network infrastructure (e.g., 3G/4G vs ADSL). For this you can assume some max bandwidth (e.g., 21Mb/sec for ADSL).
--
-- A3: Speedtests (conducted in Barcelona) organized by network type: Mobile vs ADSL
--

-- Load the ';'-separated Neubot test records with an explicit schema.
NeubotTests = LOAD 'NeubotTests' USING PigStorage(';') AS (
    client_address:chararray, client_country:chararray,
    lon:float, lat:float,
    client_provider:chararray, mlabservername:chararray,
    connect_time:float, download_speed:float, neubot_version:float,
    platform:chararray, remote_address:chararray, test_name:chararray,
    timestamp:long, upload_speed:float, latency:float,
    uuid:chararray, asnum:chararray, region:chararray, city:chararray,
    hour:int, month:int, year:int, weekday:int, day:int,
    filedate:chararray
);

-- Speedtest records whose city mentions Barcelona.
BcnTests = FILTER NeubotTests BY
    (test_name matches '.*speedtest.*') AND
    (city matches '.*Barcelona.*');

-- Partition on the 21 Mb / sec threshold (two filters instead of one SPLIT).
Mobile_Tests = FILTER BcnTests BY download_speed >  21000000; -- 21 Mb / sec
ADSL_Tests   = FILTER BcnTests BY download_speed <= 21000000; -- 21 Mb / sec

-- Round each speed up to whole millions and tag it with its network type.
MobileSpeeds = FOREACH Mobile_Tests GENERATE
    CEIL(download_speed / 1000000) AS download_speed,
    'mobile' AS network_type: chararray;

ADSLSpeeds = FOREACH ADSL_Tests GENERATE
    CEIL(download_speed / 1000000) AS download_speed,
    'adsl' AS network_type: chararray;

-- Count the tests per (rounded speed, network type) bucket.
Speeds       = UNION MobileSpeeds, ADSLSpeeds;
SpeedBuckets = GROUP Speeds BY (download_speed, network_type);
SpeedCounts  = FOREACH SpeedBuckets GENERATE
    CONCAT( (chararray) group.download_speed, ' mb/sec' ),
    group.network_type,
    COUNT(Speeds);

DUMP SpeedCounts;
  • Find the user who performed the largest number of tests. For this user, produce a table showing the evolution of her/his download/upload speeds.
--
-- A4: Determines the user who performed the largest number of tests and
-- stores his/her download/upload speed log ordered by timestamp
--

NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as (
                  client_address: chararray,
                  client_country: chararray,
                  lon: float,
                  lat: float,
                  client_provider: chararray,
                  mlabservername:  chararray,
                  connect_time:    float,
                  download_speed:  float,
                  neubot_version:  float,
                  platform:        chararray,
                  remote_address:  chararray,
                  test_name:       chararray,
                  timestamp:       long,
                  upload_speed:    float,
                  latency:  float,
                  uuid:     chararray,
                  asnum:    chararray,
                  region:   chararray,
                  city:     chararray,
                  hour:     int,
                  month:    int,
                  year:     int,
                  weekday:  int,
                  day:      int,
                  filedate: chararray
);

-- All speedtest records ('@' refers to the previously defined alias).
Tests = FILTER @ BY (test_name matches '.*speedtest.*');

-- The top user is determined among the tests whose city mentions Barcelona.
Tests_In_Barcelona = FILTER @ BY (
    city matches '.*Barcelona.*'
);

-- Number of tests per user (uuid).
Tests_Per_User = GROUP Tests_In_Barcelona BY uuid;

Tests_Per_User = FOREACH @ GENERATE
    group AS uuid,
    COUNT(Tests_In_Barcelona) AS numberOfTests
;

-- Single-row relation holding the highest per-user test count.
MAX_NUM_TESTS = GROUP @ ALL;
MAX_NUM_TESTS = FOREACH @ GENERATE
    MAX( Tests_Per_User.numberOfTests) AS numberOfTests
;

-- Users whose count equals the maximum (more than one row on a tie).
TOP_1_USER = JOIN
    Tests_Per_User BY numberOfTests,
    MAX_NUM_TESTS  BY numberOfTests
;

TOP_1_USER = FOREACH @ GENERATE Tests_Per_User::uuid AS uuid;

-- Fetch every speedtest of the top user(s), not only the Barcelona ones.
TOP_1_USER_TESTS = JOIN
    Tests BY uuid,
    TOP_1_USER BY uuid
;

-- The question asks for download AND upload speeds, so both are projected.
TOP_1_USER_TESTS = FOREACH @ GENERATE
    Tests::uuid AS uuid,
    Tests::city AS city,
    Tests::timestamp AS timestamp,
    Tests::download_speed AS download_speed,
    Tests::upload_speed AS upload_speed
;

TOP_1_USER_TESTS = ORDER @ BY timestamp;

STORE @ INTO 'Top_1_User' USING PigStorage(',');

Answers CouchDB

  • Define a view in MapReduce that contains, for each theatre, the films presented in it. Hint: You do not need a reduce here.
function(doc) {
    if(doc.feed.theaterShowtimes[0]) {
        var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
        var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;

        var movies = [];
        for(var i=0; i < moviesOnShow.length; i++) {
            var movie = moviesOnShow[i].onShow.movie;
            movies.push(movie.title);
        } // for

        emit(movieTheater.name, movies);
    } // if
} // func

  • Modify your previous answer to filter out the theatres outside Grenoble (e.g., do not include the theatres in Saint Martin d’Hères).
function(doc) {
    if(doc.feed.theaterShowtimes[0]) {
        var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
        var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;

        var movies = [];
        if(movieTheater.city == "Grenoble") {
             for(var i=0; i < moviesOnShow.length; i++) {
                 var movie = moviesOnShow[i].onShow.movie;
                 movies.push(movie.title); 
             } // for

             emit(movieTheater.name, movies);

         } // if
     } // if
} // func
  • Give the number of films that each theatre is presenting. Hint: You need a reduce here.
// Map
function(doc) {
    if(doc.feed.theaterShowtimes[0]) {
        var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
        var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;

        for(var i=0; i < moviesOnShow.length; i++) {
            emit(movieTheater.name, 1);
        } // for

     } // if
} // func


// Reduce
function (key, values) {
    return sum(values) ;
}
  • Give the list of films with a press rating higher than 4 stars. Attention: filter duplicates.
// Map
function(doc) {
    if(doc.feed.theaterShowtimes[0]) {
        var movies = doc.feed.theaterShowtimes[0].movieShowtimes;
        for(var i=0; i < movies.length; i++) {
            var movie = movies[i].onShow.movie;
            if(movie.statistics.pressRating > 4) {
                emit([movie.title, movie.statistics.pressRating], null);
            }
         } // for
    } // if
} // func

 
// Reduce
function (keys,values) {
    return null ;
}
  • Give the list of films presented 2 years ago (10.12.2011), and for each film, the theatre where it was presented and its schedule.
function(doc) {
    if(doc.feed.theaterShowtimes[0]) {
        var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
        var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes; 

        for(var i=0; i < moviesOnShow.length; i++) {
            var movie = {
                "title":   moviesOnShow[i].onShow.movie.title,
                "theater": movieTheater.name,
                "date":    moviesOnShow[i].scr[0].d,
                "schedule": []
            };

            if(movie.date == "2011-12-09") {
                var showTime = moviesOnShow[i].scr[0].t;
                for(var j=0; j < showTime.length; j++) {
                    movie.schedule.push( showTime[j].$ );
                }
                emit(movie.title, movie);
            } // if
        }  // for
    }  // if
}  // func

  • BONUS! Give the list of films, and for every film, the list of theatres that present it (this question is a challenge but we encourage you to try to solve it).
// Map
function(doc) {
    if(doc.feed.theaterShowtimes[0]) {
        var movieTheater = doc.feed.theaterShowtimes[0].place.theater;
        var moviesOnShow = doc.feed.theaterShowtimes[0].movieShowtimes;
        for(var i=0; i < moviesOnShow.length; i++) {
            var movie = moviesOnShow[i].onShow.movie;
            emit(movie.title, movieTheater); 
        } // for
    } // if
} // func


// Reduce
function(keys, values) {
    var movieTheaters = [] ; 
    for(var i=0; i<values.length; i++) {
        var theater = values[i].name;
        if(!contains(movieTheaters, theater)) {
            movieTheaters.push(theater);
        }
    } // for
    return [ movieTheaters.length, movieTheaters ];
} // func
 
/**
 * Tells whether `element` occurs in `array`.
 * Elements are compared with loose equality (==).
 *
 * @param {Array} array    list to search
 * @param {*}     element  value to look for
 * @returns {boolean} true when a matching entry exists, false otherwise
 */
function contains(array, element) {
    for (var pos = 0; pos < array.length; pos++) {
        if (array[pos] == element) {
            return true; // first match is enough
        }
    }
    return false;
}