#linux #парсер #sed
Есть файл на 100к строк примерно такого содержания: 285570,KW,,,"",0.0000,0.0000,, 285590,KW,KU,Kayfan,"",0.0000,0.0000,, 285603,KW,FA,"Janub as Surrah","",0.0000,0.0000,, 285629,KW,HA,Hawalli,"",0.0000,0.0000,, 285663,KW,HA,Bayan,"",0.0000,0.0000,, 285704,KW,HA,"As Salimiyah","",0.0000,0.0000,, 285713,KW,KU,"Ash Shuwaykh","",0.0000,0.0000,, 285716,KW,AH,"Ash Shu`aybah","",0.0000,0.0000,, 285726,KW,HA,"Ar Rumaythiyah","",0.0000,0.0000,, 285728,KW,AH,"Ar Riqqah","",0.0000,0.0000,, 285787,KW,KU,"Kuwait City","",0.0000,0.0000,, 285788,KW,KU,,"",0.0000,0.0000,, 285799,KW,JA,"Al Jahra","",0.0000,0.0000,, 285803,KW,JA,"Al Hujayjah","",0.0000,0.0000,, 285810,KW,MU,"Al Funaytis","",0.0000,0.0000,, 285811,KW,AH,"Al Fahahil","",0.0000,0.0000,, 285815,KW,FA,"Al Farwaniyah","",0.0000,0.0000,, 285839,KW,AH,"Al Ahmadi","",0.0000,0.0000,, 285855,KW,,"Ad Dawhah","",0.0000,0.0000,, 285856,KW,KU,"Ad Dasmah","",0.0000,0.0000,, 285866,KW,FA,"Abraq Khaytan","",0.0000,0.0000,, 286091,OM,MA,"Al `Udhaybah ash Shamaliyah","",0.0000,0.0000,, 286245,OM,SJ,Sur,"",0.0000,0.0000,, 286282,OM,BS,Sohar,"",0.0000,0.0000,, 286592,OM,SS,"Samad ash Sha'n","",0.0000,0.0000,, 286621,OM,ZU,Salalah,"",0.0000,0.0000,, 286647,OM,BS,Saham,"",0.0000,0.0000,, 286696,OM,MA,Ruwi,"",0.0000,0.0000,, 286726,OM,SS,"Ar Raddah","",0.0000,0.0000,, Подскажите, пожалуйста, как заключить в кавычки символы между запятыми там, где кавычек ещё нет, кроме первого и трёх последних столбцов, чтобы получилось что-то вроде такого: 285603,"KW","FA","Janub as Surrah","",0.0000,0.0000,, UPD: Также в кавычках могут быть запятые и не-ASCII-символы: 286091,OM,MA,"Al Udhaybah, ash, Shamaliyah","",0.0000,0.0000,,
Ответы
Ответ 1
Если в кавычках нет запятых, то можно обойтись перловым однострочником: cat test.txt | perl -F, -lane '@F=map{ m/^"/?"$_":"\"$_\""} @F; print join(",", @F);' UPD: Если хочется сделать хорошо, то лучше использовать полноценный парсер. В репозиториях Ubuntu/Debian (а наверное, и других дистрибутивов) точно есть пакет csvtool. С ним задача станет простой и лёгкой: csvtool format '"%1","%2","%3","%4","%5","%6","%7","%8","%9"\n' /tmp/test.txt Эта утилита умеет ещё много чего — фильтровать, складывать вместе. В вопросе появилось уточнение, что в кавычки не нужно брать первый и три последних столбца. Это легко исправляется: csvtool format '%1,"%2","%3","%4","%5","%6",%7,%8,%9\n' /tmp/test.txt
Ответ 2
awk -F ',' \ '{gsub("\"","");printf "%s,\"%s\",\"%s\",\"%s\",\"%s\",%s,%s,%s,%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9}' \ test.txt 1) Удаляем все кавычки. 2) Форматируем как нужно. UPD: awk -F ',' \ '{gsub("\"","");printf "%s,\"%s\",\"%s\",\"",$1,$2,$3;for(i=4;i<=(NF-5);i++)printf "%s,", $i;printf "\b\",\"%s\",%s,%s,%s,%s\n",$(NF-4),$(NF-3),$(NF-2),$(NF-1),$NF}' \ test.txt Так вообще будет неважно, что у тебя в 4-м поле. UUPD: awk -F ',|","|,"|",' \ '{printf "%s,\"%s\",\"%s\",\"",$1,$2,$3;for(i=4;i<=(NF-5);i++)printf "%s,", $i;printf "\b\",\"%s\",%s,%s,%s,%s\n",$(NF-4),$(NF-3),$(NF-2),$(NF-1),$NF}' \ test.txt А вот так даже сохранит кавычки внутри поля.
Ответ 3
Или вариант посложнее — его явно можно упростить, но как именно, вы можете посмотреть и сами: sed -E 's#^([0-9]+),([a-zA-Z ]+?),([a-zA-Z ]+?),([a-zA-Z ]+?)#\1,\"\2\",\"\3\",\"\4\"#;s#\"\"\"#\"#' test.txt
Комментариев нет:
Отправить комментарий