CS 683 Emerging Technologies
Fall Semester, 2008
Assignment 2
© 2006, All Rights Reserved, SDSU & Roger Whitney
San Diego State University -- This page last updated 10/2/08
Due October 7
Using mapreduce, build a database of the words in the h1 tags of web pages. You will be given a list of urls. The urls will be in a file with one url per line. Your program will fetch the web pages and extract the words inside the <h1> tags. For this assignment words are separated by a space. You do not have to worry about punctuation. Once you have the words you will build a table and/or database that uses the words as keys. The value(s) associated with the key contains all the urls of the pages that contain the word in an h1 tag. Assume that we have two pages http://foo and http://bar with the following contents.
http:foo
<h1>cat rat</h1>
http:bar
<h1>cat mat</h1>
Then our table/database looks something like:
cat |
[http:foo, http:bar] |
mat |
[http:bar] |
rat |
[http:foo] |
That is, at the key "cat" we have two entries, one for each of the pages the word is found in.
Test Pages
Here is a file containing urls to pages with h1 tags. In case you can't access the file, here are the contents of the file:
http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample1.html
http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample2.html
http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample3.html
http://www.eli.sdsu.edu/courses/fall08/cs683/assignments/assignment2Data/sample4.html
http://bismarck.sdsu.edu/~whitney/assignment2Data/sample5.html
http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample6.html
http://www.eli.sdsu.edu/courses/fall08/cs683/assignments/assignment2Data/sample7.html
http://bismarck.sdsu.edu/~whitney/assignment2Data/sample8.html
http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample9.html
http://www.eli.sdsu.edu/courses/fall08/cs683/assignments/assignment2Data/sample10.html
http://bismarck.sdsu.edu/~whitney/assignment2Data/sample11.html
http://www-rohan.sdsu.edu/faculty/whitney/public_html/cs683Ass2/sample12.html
Write a supervisor to keep the processes in part 1 alive and make an Erlang application for all the code so you can start/stop your program with application:start/application:stop functions.
For those who could not finish assignment one, code for assignment 1 problem 3 is below. While it may not be the best way to solve the problems, it does work. Unit tests are given at the end to show how the code is used.
Assignment 1 Code
File: matching.erl
-module(matching).
-compile(export_all).
%% split(Pattern, Target) -> [Before, After]
%%
%% Cuts the string Target at the first occurrence of the string Pattern.
%% Before is everything preceding that occurrence and After is everything
%% following it; Pattern itself is dropped. When Pattern never occurs the
%% result is [Target, ""]. An empty Pattern matches at the very start of a
%% non-empty Target, giving ["", Target].
split(Pattern, Target) ->
    split(Pattern, Target, []).

%% Worker for split/2. Before holds, in reverse order, the characters that
%% have already been walked past without finding Pattern.
split(_Pattern, [], Before) ->
    %% Ran out of input without a match.
    [lists:reverse(Before), ""];
split(Pattern, Target, Before) ->
    split_at(lists:prefix(Pattern, Target), Pattern, Target, Before).

%% Dispatches on whether Pattern is a prefix of the remaining Target.
split_at(true, Pattern, Target, Before) ->
    %% Match found here: skip over Pattern and return what follows it.
    After = lists:nthtail(length(Pattern), Target),
    [lists:reverse(Before), After];
split_at(false, Pattern, [Char | Rest], Before) ->
    %% No match at this position: consume one character and keep scanning.
    split(Pattern, Rest, [Char | Before]).
File: html_parsing.erl
-module(html_parsing).
-compile(export_all).
%% tag_contents(Tag, HtmlString) -> [Contents]
%%
%% Returns, in document order, the text found between each opening and
%% closing occurrence of Tag in HtmlString.
%%
%%   Tag        - tag name without angle brackets, e.g. "h1". Surrounding
%%                spaces are stripped and case is ignored.
%%   HtmlString - the document to search, as a string (list of characters).
%%
%% Behaviour worth knowing:
%%   * The whole document is lower-cased before searching, so the returned
%%     contents are lower-cased as well.
%%   * Only the exact forms "<tag>" and "</tag>" are recognised; an opening
%%     tag that carries attributes is not matched.
%%   * Elements with empty contents are omitted from the result.
%%   * If an opening tag has no closing tag, everything after the opening
%%     tag is returned as that element's contents (this follows from
%%     matching:split/2 returning the whole input when the pattern is absent).
%%   * An empty Tag or an empty HtmlString yields [].
tag_contents("", _HtmlString) ->
    [];
tag_contents(_Tag, "") ->
    [];
tag_contents(Tag, HtmlString) ->
    Name = string:to_lower(string:strip(Tag)),
    StartTag = "<" ++ Name ++ ">",
    EndTag = "</" ++ Name ++ ">",
    tag_contents(StartTag, EndTag, string:to_lower(HtmlString), []).

%% Worker for tag_contents/2. StartTag and EndTag are the complete,
%% lower-cased tag strings; AllContents accumulates results in reverse order.
tag_contents(_StartTag, _EndTag, "", AllContents) ->
    lists:reverse(AllContents);
tag_contents(StartTag, EndTag, HtmlString, AllContents) ->
    %% Discard everything up to and including the next opening tag.
    [_PreTag, TagContentsPlus] = matching:split(StartTag, HtmlString),
    %% Cut at the closing tag; Remainder is searched for further elements.
    case matching:split(EndTag, TagContentsPlus) of
        ["", Remainder] ->
            %% Empty element (or no opening tag was found): nothing to record.
            tag_contents(StartTag, EndTag, Remainder, AllContents);
        [TagContents, Remainder] ->
            tag_contents(StartTag, EndTag, Remainder, [TagContents | AllContents])
    end.
File: assignment1Test.erl
-include_lib("eunit/include/eunit.hrl").
-module(assignment1Test).
-import(matching).
%% EUnit test generator for matching:split/2.
%% Covers: a match in the middle, at the start and at the end of the target,
%% a pattern that never occurs, an empty pattern, an empty target, and
%% case-sensitive matching. ?_assertEqual is used so that a failure reports
%% both the expected and the actual value.
pattern_split_test_() ->
    [
        ?_assertEqual(["ac", "abc"], matching:split("ab", "acababc")),
        ?_assertEqual(["", "ababc"], matching:split("ac", "acababc")),
        ?_assertEqual(["acaba", ""], matching:split("bc", "acababc")),
        ?_assertEqual(["acababc", ""], matching:split("aa", "acababc")),
        ?_assertEqual(["", "acababc"], matching:split("", "acababc")),
        ?_assertEqual(["", ""], matching:split("aa", "")),
        ?_assertEqual(["1ab", "c"], matching:split("A", "1abAc"))
    ].
%% EUnit test generator for html_parsing:tag_contents/2.
%% Covers: empty document, document without the tag, a single element,
%% a tag name given with surrounding spaces and different case, several
%% elements in one document, and a tag nested among other tags.
%% ?_assertEqual is used so that a failure reports both the expected and
%% the actual value.
tag_contents_test_() ->
    [
        ?_assertEqual([], html_parsing:tag_contents("h1", "")),
        ?_assertEqual([], html_parsing:tag_contents("h1", "cat rat")),
        ?_assertEqual(["cat"], html_parsing:tag_contents("h1", "<h1>cat</h1>")),
        ?_assertEqual(["cat"], html_parsing:tag_contents(" H1 ", "<h1>cat</h1>")),
        ?_assertEqual(["a", "c"],
                      html_parsing:tag_contents(" h1 ", "<H1>a</h1>b<h1>c</h1>d")),
        ?_assertEqual(["d", "g"],
                      html_parsing:tag_contents("h2", "<h1>a</h1>b<p>c<h2>d</h2>e</p>f<h2>g</h2>"))
    ].