coding: Поиск необходимых заголовков

четверг, 9 апреля 2020 г.

Поиск необходимых заголовков

#java #алгоритм #jsoup

                    
Есть метод, который считает хедеры для таблицы

/**
 * Collects header captions from the first {@code <thead>} found in the given HTML.
 *
 * <p>Two passes are made over the rows of that {@code <thead>}:
 * <ol>
 *   <li>every {@code th} whose {@code rowspan} attribute equals the number of header rows
 *       (i.e. a cell spanning the whole header) contributes its text;</li>
 *   <li>every direct child of the last row whose tag name equals {@code tag_th}
 *       contributes its text.</li>
 * </ol>
 *
 * <p>Known limitation: full-height cells are always emitted before the cells of the last
 * row, so the result does not follow the visual column order when spanning cells are
 * interleaved with multi-row columns, and cells that span only part of the header height
 * but reach the bottom row are not reported at all.
 *
 * @param html HTML markup containing the table
 * @return header captions; empty when there is no {@code <thead>} or it has no rows
 */
private List<String> headers(String html)
{
    List<String> result = new ArrayList<>();
    Document doc = Jsoup.parse(html);

    Element firstThead = doc.select("thead").first();
    if (firstThead == null)
    {
        // No <thead> in the document: first() yields null, nothing to collect.
        return result;
    }

    Elements trOfFirstThead = firstThead.children();
    if (trOfFirstThead.isEmpty())
    {
        // Empty <thead>: last() below would yield null.
        return result;
    }

    // A cell spans the whole header when its rowspan equals the number of header rows.
    String fullRowspan = String.valueOf(trOfFirstThead.size());
    for (Element tr : trOfFirstThead)
    {
        for (Element th : tr.select("th"))
        {
            // A missing attribute reads as "", which never equals fullRowspan.
            if (fullRowspan.equals(th.attributes().get("rowspan")))
            {
                result.add(th.text());
            }
        }
    }

    // tag_th is declared outside this method; presumably it holds "th" - verify.
    for (Element element : trOfFirstThead.last().children())
    {
        if (element.tag().getName().equals(tag_th))
        {
            result.add(element.text());
        }
    }
    return result;
}


Суть метода такова - на вход поступает таблица, у которой есть раздел thead и из
него необходимо получить хедеры в виде коллекции строк. Если хедеры расположены в несколько
рядов, то выбирается нижний ряд, и по нему берутся названия. 

Данный алгоритм работает для таблиц, представленных под номерами 1, 2 и 3 (см. вложение).
Но для таблицы типа 4 хедеры находятся неправильно.

Требуемая коллекция : 

h4 h10 h11 h12 h6 h7.

При работе алгоритма получается следующая коллекция : 

h4 h7 h10 h11 h12.

Прошу помочь советом/алгоритмом, как можно реализовать нужное поведение.

P.S. исходный код таблиц.



    
    	
    
    
    
    	
    		
    			
    				Таблица 1
    				
    					
    					
    						h1
    						h2
    						h3
    					
    					
    					
    					
    						1
    						2
    						3
    					
    					
    						4
    						5
    						6
    					
    					
    						7
    						8
    						9
    					
    					
    				
    			
    		
          
    		
    			
    				Таблица 2
    				
    					
    					
    						h4
    						h5
    					
    					
    						h1
    						h2
    						h3
    					

    					
    					
    					
    						1
    						2
    						3
    					
    					
    						4
    						5
    						6
    					
    					
    						7
    						8
    						9
    					
    					
    				
    			
    		
      
    		
    			
    				Таблица 3
    				
    					
    					
    						h4
    						h5
    					
    					
    						h1
    						h2
    					

    					
    					
    					
    						1
    						2
    						3
    					
    					
    						4
    						5
    						6
    					
    					
    						7
    						8
    						9
    					
    					
    				
    			
    			
    		
    		
    			
    				Таблица 4
    				
    					
    					
    						h4
    						h5
    						h7
    					
    					
    						h1
    						h2
    						h8
    						h6
    					
    					
    						h10
    						h11
    						h12
    					

    					
    					
    					
    						1
    						2
    						3
    						4
    						5
    						6
    					
    					
    						7
    						8
    						9
    						10
    						11
    						12

h1	h2	h3
1	2	3
4	5	6
7	8	9

h4	h5
1	2	3
4	5	6
7	8	9

h4	h5
1	2	3
4	5	6
7	8	9

h4	h5	h7
1	2	3	4	5	6
7	8	9	10	11	12

Ответы

Ответ 1


По-моему, решением будет реализация в каком-то объеме прописанного в HTML5 алгоритма
построения таблицы, благо обработка thead там простая. Вот этот код выдает нужный
результат на ваших примерах:

static class TableHeader {
    private String[][] cells;
    private int y_height = 0;
    private int x_width = 0;

    public TableHeader( int rows, int columns, Element thead ) {
        cells = new String[rows][columns];

        parseTHead( thead );
    }

    private void ensureCapacity( int rows, int columns ) {
        if ( rows <= cells.length && columns <= cells[0].length ) return;

        int nRows = Math.max( cells.length, rows );
        int nColumns = Math.max( cells[0].length, columns );

        String[][] newCells = new String[nRows][nColumns];
        for ( int row = 0; row < cells.length; row++ ) {
            System.arraycopy(cells[row], 0, newCells[row], 0, cells[row].length );
        }

        cells = newCells;
    }

    private void fill( String cellValue, int row, int col, int rowspan, int colspan ) {
        ensureCapacity( row + rowspan, col + colspan );
        for ( int r = 0; r < rowspan; r++ ) {
            for ( int c = 0; c < colspan; c++ ) {
                cells[row + r][col + c] = cellValue;
            }
        }
    }


    private int cellSpan( Element th, String attrName ) {
        String attrValue = th.attr( attrName );
        int result = 1;
        if ( attrValue.isEmpty() ) return result;
        try {
            result = Integer.parseInt( attrValue );
        } catch ( NumberFormatException ex ) { /*ignore*/ };
        return result;
    }

    // http://www.w3.org/TR/html5/tabular-data.html#algorithm-for-processing-row-groups
    private void parseTHead( Element thead ) {
        //int y_start = y_height; // #1
        int y_current = 0;
        final Elements rows = thead.children().select( "tr" );
        final int rowsNumber = rows.size();
        ensureCapacity(rowsNumber, x_width);
        for ( Element tr : rows ) { // #2
            //http://www.w3.org/TR/html5/tabular-data.html#algorithm-for-processing-rows
            if ( y_height == y_current ) {
                y_height += 1;
            }
            int x_current = 0;
            //TODO: Run the algorithm for growing 'downward-growing cells'.
            for ( Element currentCell : tr.children().select( "td, th" ) ) {
                //6. While xcurrent is less than xwidth and the slot with coordinate
(xcurrent, ycurrent)
                //  already has a cell assigned to it, increase xcurrent by 1.
                while ( x_current < x_width && cells[y_current][x_current] != null
) x_current += 1;
                if ( x_current == x_width ) {
                    x_width += 1; //# 7
                }
                int colspan = cellSpan( currentCell, "colspan" ); //#8 
                int rowspan = cellSpan( currentCell, "rowspan" ); //#9
                if (colspan == 0) colspan = 1; 
                //TODO: 10. If rowspan is zero and the table element's Document is
not set to quirks mode,
                // then let 'cell grows downward' be true, and set rowspan to 1.
                // Otherwise, let cell grows downward be false.
                //FIXME: не позволяем rowspan создавать больше строк, чем есть 
                //  как этот вопрос решен в стандарте?
                rowspan = Math.min( rowsNumber - y_current, rowspan );
                if ( x_width < x_current + colspan ) x_width = x_current + colspan;
                if ( y_height < y_current + rowspan ) y_height = y_current + rowspan;
                // TODO: If any of the slots involved already had a cell covering them,
                //   then this is a table model error.
                //   Those slots now have two cells overlapping.
                fill( currentCell.text(), y_current, x_current, rowspan, colspan
); // #13
                // TODO: If 'cell grows downward' is true, then add the tuple
                //   {c, xcurrent, colspan} to the list of 'downward-growing cells'.
                x_current += colspan; //#15
            }
            y_current += 1;
        }
    }

    public List lastRow() {
        return Arrays.stream( cells[y_height - 1]).limit( x_width ).collect( Collectors.toList());
    }
}

/**
 * Extracts the column captions from the first {@code <thead>} of the given HTML by
 * building a {@link TableHeader} grid and taking its bottom row.
 *
 * @param html HTML markup containing the table
 * @return captions of the last header row, one per column; empty when the markup
 *         contains no {@code <thead>}
 */
private static List<String> headers3(String html) {
    Document doc = Jsoup.parse(html);

    Element firstThead = doc.select("thead").first();
    if (firstThead == null) {
        // first() yields null for an empty selection; there is no header to parse.
        return new java.util.ArrayList<>();
    }

    // Only a starting size: TableHeader enlarges its grid when the header needs more.
    final int initialRows = 10;
    final int initialColumns = 10;
    TableHeader header = new TableHeader(initialRows, initialColumns, firstThead);

    return header.lastRow();
}


В реализации не обрабатывается случай с rowspan="0", вроде как все манипуляции с
шириной и высотой можно закинуть в fill, и ни на чем, кроме ваших примеров я ее не
проверял. В качестве бонуса, такой подход позволяет легко получить полный заголовок
столбца.

upd: есть очевидная проблема со случаем, когда y_current + rowspan превышает количество
строк (tr), в результате fill создает лишние ряды, чего в браузере не наблюдается. С colspan
наверняка та же ситуация. Пока просто ограничил rowspan сверху, но я явно чего-то не
понимаю в стандарте.

coding

Страницы

Поиск по вопросам

четверг, 9 апреля 2020 г.

Поиск необходимых заголовков

Ответы

Ответ 1

Комментариев нет:

Отправить комментарий

Страницы

Поиск по вопросам

четверг, 9 апреля 2020 г.

Поиск необходимых заголовков

Ответы

Ответ 1

Комментариев нет:

Отправить комментарий

четверг, 9 апреля 2020 г.