| 1 | -module(tbray6). |
|---|
| 2 | -compile([native]). |
|---|
| 3 | -export([start/1]). |
|---|
| 4 | |
|---|
| 5 | -define(BUFFER_SIZE, (1024 * 10000)). |
|---|
| 6 | |
|---|
%% Entry point. Splits FileName into chunks (one worker process per chunk),
%% collects every worker's dict, merges them and prints the result.
%% Returns whatever print_result/1 returns.
start(FileName) ->
    Workers = read_file(FileName),
    Dicts = [wait_result(W) || W <- Workers],
    Merged = merge_dicts(Dicts),
    print_result(Merged).
|---|
| 10 | |
|---|
%% Opens FileName as a raw binary file and hands it to read_file_1/3,
%% starting at offset 0 with no workers spawned yet.
%% Crashes with badmatch if the file cannot be opened.
read_file(FileName) ->
    {ok, Fd} = file:open(FileName, [raw, binary]),
    read_file_1(Fd, 0, []).
|---|
%% Reads File in ?BUFFER_SIZE pieces starting at Offset and spawns one
%% scan_chunk/1 worker per piece. Returns the accumulated list of
%% {Pid, MonitorRef} pairs (most recently spawned first) once pread
%% reports eof, closing the file on the way out.
%%
%% Each chunk is normally cut at its last newline so the next read starts
%% on a fresh line; see chunk_extent/2 for the case where no such newline
%% exists.
read_file_1(File, Offset, Workers) ->
    case file:pread(File, Offset, ?BUFFER_SIZE) of
        eof ->
            file:close(File),
            Workers;
        {ok, Bin} ->
            {DataL, NextOffset} = chunk_extent(Bin, Offset),
            Worker = spawn_worker(self(), fun scan_chunk/1, {Bin, DataL}),
            read_file_1(File, NextOffset, [Worker | Workers])
    end.

%% Decides how many bytes of Bin the worker should scan and where the
%% next read must start. Returns {DataL, NextOffset}.
%%
%% split_on_last_newline/1 returns 0 when it finds no newline past the
%% first byte. Using that 0 directly would make the worker scan nothing
%% and advance the file position by a single byte, so a tail without a
%% trailing newline would be re-read byte by byte and never scanned.
%% In that case the whole chunk is consumed instead.
chunk_extent(Bin, Offset) ->
    case split_on_last_newline(Bin) of
        0 ->
            Size = byte_size(Bin),
            {Size, Offset + Size};
        DataL ->
            %% Skip the newline itself when resuming.
            {DataL, Offset + DataL + 1}
    end.
|---|
| 24 | |
|---|
%% Returns the byte offset of the last newline in Bin, searching from the
%% end down to offset 1. Returns 0 when no newline is found in that range
%% (offset 0 itself is never inspected).
split_on_last_newline(Bin) -> last_newline_at_or_before(Bin, size(Bin)).

%% Walks Pos downwards, probing the byte at Pos with a binary pattern.
last_newline_at_or_before(Bin, Pos) when Pos > 0 ->
    case Bin of
        <<_:Pos/binary, $\n, _/binary>> ->
            Pos;
        _ ->
            last_newline_at_or_before(Bin, Pos - 1)
    end;
last_newline_at_or_before(_Bin, Pos) ->
    Pos.
|---|
| 32 | |
|---|
| 33 | |
|---|
%% Counts article hits in the first DataL bytes of Bin and returns them as
%% a dict mapping Key (binary) to hit count.
%%
%% A hit is the 34-byte prefix "GET /ongoing/When/" ++ 3 bytes ++ "x/" ++
%% 4 bytes ++ "/" ++ 2 bytes ++ "/" ++ 2 bytes ++ "/", followed by a tail
%% that match_until_space_newline/2 accepts. Key starts 23 bytes into the
%% match (just after the "x/") and runs up to the terminating space.
%%
%% After every candidate the binary is split and scanning restarts at
%% offset 0 of the remainder, with DataL reduced accordingly.
scan_chunk({Bin, DataL}) -> scan_chunk_1(Bin, DataL, 0, dict:new()).

%% S is the current probe offset into Bin; stop once fewer than 34 bytes
%% of the DataL window remain after S.
scan_chunk_1(Bin, DataL, S, Dict) when S < DataL - 34 ->
    case Bin of
        <<_:S/binary,"GET /ongoing/When/",_,_,_,$x,$/,_,_,_,_,$/,_,_,$/,_,_,$/,_/binary>> ->
            case match_until_space_newline(Bin, S + 34) of
                {true, E} ->
                    Skip = S + 23,
                    L = E - Skip,
                    <<_:Skip/binary,Key:L/binary,Rest/binary>> = Bin,
                    %% Key is a sub-binary of the whole chunk; copy it so
                    %% the dict (which is later sent to the parent) does
                    %% not keep the entire chunk alive.
                    Counted = dict:update_counter(binary:copy(Key), 1, Dict),
                    scan_chunk_1(Rest, DataL - E, 0, Counted);
                {false, E} ->
                    %% Not a countable hit: skip past the rejected tail.
                    <<_:E/binary,Rest/binary>> = Bin,
                    scan_chunk_1(Rest, DataL - E, 0, Dict)
            end;
        _ ->
            scan_chunk_1(Bin, DataL, S + 1, Dict)
    end;
scan_chunk_1(_, _, _, Dict) -> Dict.
|---|
| 51 | |
|---|
%% Scans Bin forward from offset Pos looking for the end of a token.
%%   {true, P}  - the byte at P is a space and the byte before it was
%%                reached without meeting a newline or a '.'.
%%   {false, P} - a newline or '.' was found at P, or P ran off the end
%%                of Bin.
match_until_space_newline(Bin, Pos) when Pos < size(Bin) ->
    case Bin of
        <<_:Pos/binary, C, _/binary>> when C =:= $\n; C =:= $. ->
            {false, Pos};
        <<_:Pos/binary, _, $\s, _/binary>> ->
            {true, Pos + 1};
        _ ->
            match_until_space_newline(Bin, Pos + 1)
    end;
match_until_space_newline(_Bin, Pos) ->
    {false, Pos}.
|---|
| 60 | |
|---|
%% Spawns a monitored process that evaluates F(A) and sends
%% {WorkerPid, Result} to Parent. Returns {Pid, MonitorRef}.
spawn_worker(Parent, F, A) ->
    Job = fun() ->
        Me = self(),
        Result = F(A),
        Parent ! {Me, Result}
    end,
    erlang:spawn_monitor(Job).
|---|
| 63 | |
|---|
%% Waits for the monitored worker {Pid, Ref} to terminate.
%% On a normal exit, returns the Result from the worker's {Pid, Result}
%% message; on any other exit reason, exits the caller with that reason.
wait_result({Pid, Ref}) ->
    receive
        {'DOWN', Ref, _Type, _Object, Reason} ->
            case Reason of
                normal ->
                    receive
                        {Pid, Result} -> Result
                    end;
                _ ->
                    exit(Reason)
            end
    end.
|---|
| 69 | |
|---|
%% Merges a list of counter dicts into one, summing the values of keys
%% that occur in more than one dict.
%%
%% An empty list yields an empty dict; without that clause an input with
%% no dicts at all (e.g. no workers were spawned) crashed with
%% function_clause.
merge_dicts([]) ->
    dict:new();
merge_dicts([D]) ->
    D;
merge_dicts([D1, D2 | Rest]) ->
    Sum = fun(_Key, V1, V2) -> V1 + V2 end,
    merge_dicts([dict:merge(Sum, D1, D2) | Rest]).
|---|
| 73 | |
|---|
%% Prints up to ten {Key, Count} entries of Dict, highest count first,
%% one per line as "Count<TAB>: Key". Returns the list of io:format/2
%% results (one 'ok' per printed line).
print_result(Dict) ->
    Ascending = lists:keysort(2, dict:to_list(Dict)),
    Top = lists:sublist(lists:reverse(Ascending), 10),
    [io:format("~b\t: ~p~n", [Count, Key]) || {Key, Count} <- Top].
|---|
| 77 | |
|---|