Viewed   60 times

What's the best/most efficient way to extract text set between parenthesis? Say I wanted to get the string "text" from the string "ignore everything except this (text)" in the most efficient manner possible.

So far, the best I've come up with is this:

$fullString = "ignore everything except this (text)";
$start = strpos('(', $fullString);
$end = strlen($fullString) - strpos(')', $fullString);

$shortString = substr($fullString, $start, $end);

Is there a better way to do this? I know in general using regex tends to be less efficient, but unless I can reduce the number of function calls, perhaps this would be the best approach? Thoughts?

 Answers

2

i'd just do a regex and get it over with. unless you are doing enough iterations that it becomes a huge performance issue, it's just easier to code (and understand when you look back on it)

$text = 'ignore everything except this (text)';
preg_match('#((.*?))#', $text, $match);
print $match[1];
Monday, August 29, 2022
3

you have few unescaped string, that is causing the error. a simple formatting could have saved you the time.

$output = '{
    "playerId":1178,
    "percentChange":0.1,
    "averageDraftPosition":260,
    "percentOwned":0.1,
    "mostRecentNews": {
        "news":"Accardo was called up from Columbus on Monday, the Indians official Twitter feed reports",
        "spin":"Hell replace Dan Wheeler on the active roster after carrying a 2.76 ERA over 13 appearances with the Clippers to start the season.",
        "date":"Mon May 14"
    },
    "fullName":"Jeremy Accardo"
}';
$json = json_decode($output);
Thursday, December 22, 2022
 
5

I think that the best way to do what you need is to find and isolate the cells in the file and then apply OCR to each individual cell.

There are a number of solutions in SO for that, I got the code from this answer and played around a little with the parameters to get the output below (not perfect yet, but you can tweak it a little bit yourself).

import os
import cv2
import imutils

# This only works if there's only one table on a page
# Important parameters:
#  - morph_size
#  - min_text_height_limit
#  - max_text_height_limit
#  - cell_threshold
#  - min_columns


def pre_process_image(img, save_in_file, morph_size=(23, 23)):

    # get rid of the color
    pre = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Otsu threshold
    pre = cv2.threshold(pre, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # dilate the text to make it solid spot
    cpy = pre.copy()
    struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
    pre = ~cpy

    if save_in_file is not None:
        cv2.imwrite(save_in_file, pre)
    return pre


def find_text_boxes(pre, min_text_height_limit=20, max_text_height_limit=120):
    # Looking for the text spots contours
    contours, _ = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    # Getting the texts bounding boxes based on the text size assumptions
    boxes = []
    for contour in contours:
        box = cv2.boundingRect(contour)
        h = box[3]

        if min_text_height_limit < h < max_text_height_limit:
            boxes.append(box)

    return boxes


def find_table_in_boxes(boxes, cell_threshold=100, min_columns=3):
    rows = {}
    cols = {}

    # Clustering the bounding boxes by their positions
    for box in boxes:
        (x, y, w, h) = box
        col_key = x // cell_threshold
        row_key = y // cell_threshold
        cols[row_key] = [box] if col_key not in cols else cols[col_key] + [box]
        rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]

    # Filtering out the clusters having less than 2 cols
    table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
    # Sorting the row cells by x coord
    table_cells = [list(sorted(tb)) for tb in table_cells]
    # Sorting rows by the y coord
    table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))

    return table_cells


def build_lines(table_cells):
    if table_cells is None or len(table_cells) <= 0:
        return [], []

    max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
    max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]

    max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
    max_y = max_last_row_height_box[1] + max_last_row_height_box[3]

    hor_lines = []
    ver_lines = []

    for box in table_cells:
        x = box[0][0]
        y = box[0][1]
        hor_lines.append((x, y, max_x, y))

    for box in table_cells[0]:
        x = box[0]
        y = box[1]
        ver_lines.append((x, y, x, max_y))

    (x, y, w, h) = table_cells[0][-1]
    ver_lines.append((max_x, y, max_x, max_y))
    (x, y, w, h) = table_cells[0][0]
    hor_lines.append((x, max_y, max_x, max_y))

    return hor_lines, ver_lines


if __name__ == "__main__":
    in_file = os.path.join(".", "test.jpg")
    pre_file = os.path.join(".", "pre.png")
    out_file = os.path.join(".", "out.png")

    img = cv2.imread(os.path.join(in_file))

    pre_processed = pre_process_image(img, pre_file)
    text_boxes = find_text_boxes(pre_processed)
    cells = find_table_in_boxes(text_boxes)
    hor_lines, ver_lines = build_lines(cells)

    # Visualize the result
    vis = img.copy()

    # for box in text_boxes:
    #     (x, y, w, h) = box
    #     cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

    for line in hor_lines:
        [x1, y1, x2, y2] = line
        cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

    for line in ver_lines:
        [x1, y1, x2, y2] = line
        cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

    cv2.imwrite(out_file, vis)

Wednesday, November 16, 2022
 
cepheus
 
4
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.immoweb.be/en/search/apartment/for-sale/leuven/3000')
soup = BeautifulSoup(r.text, 'html.parser')

for item in soup.findAll('div', attrs={'class': 'xl-surface-ch'}):
    item = item.text.strip()
    if 'm²' in item:
        print(item[0:item.find('m')])
    else:
        item = 0
        print(item)

Output:

84 
70 
80 
32 
149 
22 
75 
30 
23 
104 
0
95 
129 
26 
55 
26 
25 
28 
33 
210 
37 
69 
36 
19 
119 
20 
20 
129 
154 
25 
Sunday, November 20, 2022
 
4

The closest I could get was using preg_split() instead:

$string = <<< STR
Are you looking for a quality real estate company? <s>Josh's real estate firm 
specializes in helping people find homes from [city][State].</s>
<s>Josh's real estate company is a boutique real estate firm serving clients 
locally.</s> In [city][state] I am sure you know how difficult it is
to find a great home, but we work closely with you to give you exactly 
what you need
STR;

print_r(preg_split(':</?s>:is', $string));

And got this output:

Array
(
    [0] => Are you looking for a quality real estate company? 
    [1] => Josh's real estate firm 
specializes in helping people find homes from [city][State].
    [2] => 

    [3] => Josh's real estate company is a boutique real estate firm serving clients 
locally.
    [4] =>  In [city][state] I am sure you know how difficult it is
to find a great home, but we work closely with you to give you exactly 
what you need
)

Except that produces an extra array element (index 2) where there's a newline between the fragments [city][State].</s> and <s>Josh's real estate company.

It'd be trivial to add some code to remove the whitespace matches though, but I'm not sure if you desire that.

Wednesday, August 10, 2022
 
codefx
 
Only authorized users can answer the search term. Please sign in first, or register a free account.
Not the answer you're looking for? Browse other questions tagged :