init
This commit is contained in:
parent
398a876b2b
commit
003336686b
18
README.md
18
README.md
@ -1,3 +1,17 @@
|
||||
# sbis
|
||||
# Парсер выгрузок из SBIS
|
||||
|
||||
Парсер выгрузок из SBIS
|
||||
Разбирает выборки из базы SBIS, выбирая телефоны, адрес, электронную почту.
|
||||
|
||||
В примере разбор выгрузки базы IT-компаний со скрытыми контактными данными.
|
||||
|
||||
## Использование
|
||||
|
||||
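Требуется lua>=5.1. По умолчанию исходные файлы должны находиться в папке input текущей рабочей директории (путь ./input). Внимание: скрипт перезаписывает исходные файлы, заменяя в них двойные кавычки на одинарные.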
|
||||
|
||||
```
|
||||
luarocks install csv
|
||||
luarocks install luafilesystem
luarocks install lua-cjson
|
||||
|
||||
lua parser.lua
|
||||
|
||||
```
|
||||
|
||||
2802
output.csv
Normal file
2802
output.csv
Normal file
File diff suppressed because it is too large
Load Diff
162
parser.lua
Normal file
162
parser.lua
Normal file
@ -0,0 +1,162 @@
|
||||
local csv = require('csv')
|
||||
local lfs = require('lfs')
|
||||
local json = require('cjson');
|
||||
|
||||
local dir = "./input" -- Directory containing the exported files to parse
|
||||
|
||||
|
||||
-- CSV column names, joined with ';' to form the header row of the output.
local headers = {
  "name", "industry", "type", "website", "teams",
  "emailAddress",
  "billingAddressCity", "billingAddress",
  "phoneNumberOffice",
  "phoneNumberOther",
  "phoneNumberOther1", "phoneNumberOther2",
  "phoneNumberOther3", "phoneNumberOther4",
}
|
||||
|
||||
--- Strip leading and trailing whitespace from a string.
-- A nil or false argument is returned unchanged.
-- NOTE: this function is defined as a global, not a local.
-- @param s string|nil value to trim
-- @return string|nil the trimmed string, or `s` itself when it is falsy
function trim(s)
  if s then
    -- The anchored pattern always matches; the lazy capture excludes the
    -- surrounding whitespace.
    return string.match(s, "^%s*(.-)%s*$")
  end
  return s
end
|
||||
|
||||
--- Normalize quote characters in a file in place and return the new content.
-- Every double quote is replaced with a single quote, then each pair of
-- adjacent single quotes is collapsed into one. The file at `path` is
-- overwritten with the result.
-- @param path string: path of the file to rewrite
-- @return string: the normalized content
-- Raises an error when the file cannot be opened, read or written.
local function read_file(path)
  local file = io.open(path) or error("Не могу открыть file: " .. path)
  local content = file:read("*a")
  -- Close before any error is raised so the read handle is never leaked.
  file:close()
  if not content then
    error("Не могу прочитать " .. path)
  end

  content = string.gsub(content, '"', "'")
  content = string.gsub(content, "''", "'")

  -- io.open returns nil on failure, so it must be checked before use.
  file = io.open(path, 'w+') or error("Не могу записать " .. path)
  local written = file:write(content)
  file:close()
  if not written then
    error("Не могу записать " .. path)
  end
  return content
end
|
||||
|
||||
--- Pick one e-mail address from a comma-separated list.
-- The first entry containing "info@" wins. Otherwise the first entry of the
-- list is used, but only if it contains "@".
-- @param mails string|nil: comma-separated addresses
-- @return string|nil: the chosen address with surrounding whitespace
--   removed, or nil when nothing suitable is found (including nil input)
local function get_mail(mails)
  -- A missing column arrives as nil; treat it as "no address".
  if mails == nil then
    return nil
  end

  local first
  for w in string.gmatch(mails, "[^,]+") do
    -- Plain-text search: no pattern matching is needed here.
    if string.find(w, "info@", 1, true) then
      return trim(w)
    end
    if first == nil then
      first = w
    end
  end

  if first ~= nil and string.find(first, "@", 1, true) then
    return trim(first)
  end
  return nil
end
|
||||
|
||||
--- Split a comma-separated list of phone numbers into named slots.
-- Each entry is reduced to its digits. The first entry goes to
-- `phoneNumberOffice`, the second to `phoneNumberOther`, and entry number
-- i (i >= 3) to `phoneNumberOther<i-2>`.
-- @param ph string|nil: comma-separated phone numbers
-- @return table: slot name -> digits; the six standard slots are always
--   present and default to ''
local function get_phones(ph)
  local phones = {
    phoneNumberOffice = '',
    phoneNumberOther = '',
    phoneNumberOther1 = '',
    phoneNumberOther2 = '',
    phoneNumberOther3 = '',
    phoneNumberOther4 = '',
  }

  local i = 0
  -- A nil argument is treated as an empty list.
  for w in string.gmatch(ph or '', "[^,]+") do
    i = i + 1
    -- Keep only the first result of gsub (the string), not the match count.
    local digits = string.gsub(w, '%D', '')
    local key
    if i == 1 then
      key = 'phoneNumberOffice'
    elseif i == 2 then
      key = 'phoneNumberOther'
    else
      key = 'phoneNumberOther' .. (i - 2)
    end
    phones[key] = digits
  end
  return phones
end
|
||||
|
||||
--- Split an address string into the full address and its city.
-- The city is the text that follows the literal marker "г. " up to the next
-- comma or the end of the string.
-- @param adr string|nil: full address
-- @return table: `billingAddress` (the input, '' for nil) and
--   `billingAddressCity` (the city, '' when no marker is found)
local function get_address(adr)
  adr = adr or ''
  -- "%." matches a literal dot. A bare "." is a pattern wildcard and would
  -- also match e.g. "г, ", producing a bogus city.
  local city = string.match(adr, 'г%. ([^,]+)')
  return {
    billingAddress = adr,
    billingAddressCity = city or '',
  }
end
|
||||
|
||||
--- Build a site URL from a comma-separated list of sites.
-- Only the first entry is used. "https://" is prepended unless the entry
-- already starts with a URL scheme.
-- @param st string|nil: comma-separated site addresses
-- @return string: the URL; '' for nil input or a blank first entry; the
--   trimmed input when it contains no entries at all
local function get_site(st)
  if st == nil then
    return ''
  end

  local first = string.match(st, "[^,]+")
  if first == nil then
    return trim(st)
  end

  local url = trim(first)
  if url == '' then
    -- Avoid emitting a bare "https://" for a blank entry.
    return ''
  end
  -- Leave values such as "http://example.com" alone instead of producing
  -- "https://http://example.com".
  if string.find(url, "^%a[%w+.-]*://") then
    return url
  end
  return 'https://' .. url
end
|
||||
|
||||
--- Extract the first line of business from a comma-separated list.
-- @param st string: comma-separated list
-- @return string: the first entry with surrounding whitespace removed, or
--   the trimmed input when it contains no entries
local function get_industry(st)
  -- Pull only the first token from the iterator; later ones are never used.
  local next_token = string.gmatch(st, "[^,]+")
  return trim(next_token() or st)
end
|
||||
|
||||
--- Walk a directory tree and append one output row per usable CSV record.
-- Every regular file whose name ends in ".csv" is first normalized by
-- read_file (which rewrites it in place) and then read with ';' as the field
-- separator. A record is written only when get_mail finds an address in
-- column 10. Subdirectories are processed recursively.
-- @param dir string: directory to scan
-- @param out file|nil: already-open output handle; when omitted,
--   'output.csv' is opened in append mode and closed before returning
local function parse(dir, out)
  -- Only the outermost call owns the handle. Passing one local handle down
  -- the recursion keeps a nested call from closing the file its caller is
  -- still writing to.
  local owns_out = (out == nil)
  if owns_out then
    out = assert(io.open('output.csv', 'a+'))
  end

  for entry in lfs.dir(dir) do
    local path = dir .. "/" .. entry
    local mode = lfs.attributes(path, "mode")
    -- "%." matches a literal dot; "\." is not a valid Lua string escape.
    if mode == "file" and string.find(entry, ".+%.csv$") then
      print('Обрабатывается файл: ' .. path)
      read_file(path)
      local f = assert(csv.open(path, {separator = ';'}))
      for fields in f:lines() do
        -- Short rows leave trailing columns nil; substitute '' so neither
        -- the helpers nor table.concat ever see nil.
        local mail = get_mail(fields[10] or '')
        if mail then
          local address = get_address(fields[4] or '')
          local phones = get_phones(fields[9] or '')
          local row = {
            fields[1] or '',
            get_industry(fields[8] or ''),
            'SBIS',
            get_site(fields[17] or ''),
            'SBIS',
            mail,
            address['billingAddressCity'],
            address['billingAddress'],
            phones['phoneNumberOffice'],
            phones['phoneNumberOther'],
            phones['phoneNumberOther1'],
            phones['phoneNumberOther2'],
            phones['phoneNumberOther3'],
            phones['phoneNumberOther4'],
          }
          out:write(table.concat(row, ';'), '\n')
        end
      end
    elseif mode == "directory" and entry ~= "." and entry ~= ".." then
      parse(path, out)
    end
  end

  if owns_out then
    out:close()
  end
end
|
||||
|
||||
-- Start a fresh output.csv that contains only the header row, then append
-- the parsed records. The handle is local and checked, so a failed open
-- reports the system error instead of an index-nil error.
local out = assert(io.open('output.csv', 'w+'))
out:write(table.concat(headers, ';'), '\n')
out:close()

parse(dir)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user